11

Is there any way I can call CUDA runtime function calls such as

cudaMemcpy(...);

in a .cpp file, compiled with a regular C++ compiler?

einpoklum
  • 118,144
  • 57
  • 340
  • 684
small_potato
  • 3,127
  • 5
  • 39
  • 45
  • Why don't you just try? :) And yes, this particular function cudaMemcpy() can be called from a C file. By the way I find cuda documentation terrible in that sort of details. – Slava Aug 29 '11 at 12:27
  • Following on from Preet's answer: you'll also need to link against `cudart` to satisfy the linker. I.e. nvcc -lcudart myfile.cpp – Edric Sep 28 '10 at 10:30

3 Answers

19

EDIT: There was an example linked here but it is no longer available; most of the example has been copied below.

The caller C (but could be C++)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

/* Forward declaration: implemented in the .cu file and compiled by nvcc;
 * this translation unit can be built with a plain C/C++ compiler. */
extern void kernel_wrapper(int *a, int *b);

/*
 * Host entry point: sets up two integers on the host and hands their
 * addresses to the CUDA wrapper, which updates them via the device.
 */
int main(int argc, char *argv[])
{
   int host_a = 2;
   int host_b = 3;

   kernel_wrapper(&host_a, &host_b);

   return 0;
}

The Callee (CUDA)

// Device kernel: thread 0 adds 10 to *a, thread 1 adds 3 to *b.
// Intended for a launch with (at least) 2 threads in x; any other
// thread is a no-op.
__global__ void kernel(int *a, int *b)
{
   const int lane = threadIdx.x;

   if (lane == 0) {
      *a += 10;
   } else if (lane == 1) {
      *b += 3;
   }
}

// Host-callable wrapper: copies *a and *b to the device, runs `kernel`
// with 2 threads, and copies the results back into *a and *b.
// Note: production code should check every cudaError_t return value and
// call cudaGetLastError() after the launch.
void kernel_wrapper(int *a, int *b)
{
   int *d_1, *d_2;
   dim3 threads( 2, 1 );
   dim3 blocks( 1, 1 );

   cudaMalloc( (void **)&d_1, sizeof(int) );
   cudaMalloc( (void **)&d_2, sizeof(int) );

   cudaMemcpy( d_1, a, sizeof(int), cudaMemcpyHostToDevice );
   cudaMemcpy( d_2, b, sizeof(int), cudaMemcpyHostToDevice );

   // BUG FIX: the original launched kernel<<<...>>>( a, b ) with the HOST
   // pointers; dereferencing those on the device is an illegal access, and
   // the copies below would just read back the unmodified inputs. The
   // device pointers d_1/d_2 must be passed instead.
   kernel<<< blocks, threads >>>( d_1, d_2 );

   // cudaMemcpy on the default stream synchronizes with the launch above,
   // so no explicit cudaDeviceSynchronize() is needed for correctness here.
   cudaMemcpy( a, d_1, sizeof(int), cudaMemcpyDeviceToHost );
   cudaMemcpy( b, d_2, sizeof(int), cudaMemcpyDeviceToHost );

   cudaFree(d_1);
   cudaFree(d_2);
}
Preet Sangha
  • 64,563
  • 18
  • 145
  • 216
0

you can use

g++ -I/usr/local/cuda/include filename.cpp -o obj -L/usr/local/cuda/lib64 -lcudart

for compile or

nvcc filename.cu
Richard
  • 56,349
  • 34
  • 180
  • 251
9113303
  • 852
  • 1
  • 16
  • 30
0

Similarly to @PreetSangha (who provided a very useful answer), I had some issues when running it as extern ... so I would just like to add the solution which worked for me (including templated function calls).

This is the code for my example (the full CUDA code is excluded because it is already in @PreetSangha's example) and is supposed to give the main idea of how it works. It was compiled and confirmed to run on a Linux machine. I haven't tried it on Windows yet, but it should be similar. In my scenario I wanted to try int, float and double, but more templates could be added.

// main.cpp
#include "wrapper.hpp"

// Host entry point: calls the nvcc-compiled template wrapper declared in
// wrapper.hpp from a plain C++ translation unit.
int main(int argc, char *argv[]) {
    // BUG FIX: the original called runOnGPU(1,2,3), but the declared
    // signature is runOnGPU(T*, T*, int) — template deduction of T* from
    // an int literal fails, so that call does not compile. Pass addresses
    // of real objects (T deduces to int, matching an explicit instantiation).
    int a = 1;
    int b = 2;
    runOnGPU(&a, &b, 3);
}
// cuda.cu
#include "wrapper.hpp"

// Naive n x n matrix multiply: c = a * b (row-major), one thread per
// output element. Expects a 2D launch covering at least n x n threads;
// threads outside the matrix do nothing.
template<typename T>
__global__ static void matMultCUDA(const T* a, const T* b, T* c, int n) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < n && row < n) {
        T value = 0;
        for (int j = 0; j < n; j++) {
            value += a[row*n + j] * b[j*n + col];
        }
        // BUG FIX: the original performed this store OUTSIDE the bounds
        // check, so every tail thread (col >= n or row >= n) wrote out of
        // range into c — an out-of-bounds global write that
        // compute-sanitizer would flag. The store must be guarded too.
        c[row*n + col] = value;
    }
}

// Placeholder kept from the original answer: device probing/initialization
// was elided by the author.
// NOTE(review): as written this falls off the end of a bool-returning
// function without a return statement (undefined behavior in C++); a real
// implementation must return a success/failure bool.
bool InitCUDA(bool b) {
   /* CUDA Initialization */
}

// Host-callable template wrapper (compiled by nvcc) that launches the
// templated kernel; this is the function main.cpp links against via the
// explicit instantiations below.
// NOTE(review): the body is abbreviated by the author — cuda_a/cuda_b/
// cuda_c and dimGrid/dimBlock come from the elided allocation/copy code,
// and the declared float return value is never produced here; both must
// be supplied in a real implementation.
template<typename T>
float runOnGPU(T *a, T *b, int n) {
   /* Do CUDA things here :D */
   matMultCUDA<<<dimGrid, dimBlock>>>(cuda_a , cuda_b , cuda_c , n);
}


template float runOnGPU<int>(int* a, int* b, int n);
template float runOnGPU<float>(float* a, float* b, int n);
template float runOnGPU<double>(double* a, double* b, int n);
// wrapper.hpp
// Host-facing declarations for the entry points implemented in cuda.cu.
// Including only this header lets main.cpp be compiled with a plain C++
// compiler; the definitions are linked in from the nvcc-built object.

bool InitCUDA(bool b);

// Only the explicit instantiations provided in cuda.cu (int, float,
// double) can be linked against — the template definition lives there.
template<typename T>
float runOnGPU(T *a, T *b, int n);
# makefile
# Build scheme: main.cpp is compiled by the host compiler, cuda.cu by nvcc,
# and the final link is done by g++ against the CUDA runtime (-lcudart).
CXX = g++
CXXFLAGS = -O3
NVCC = nvcc
NVCCFLAGS = -O3

# CUDA runtime library, resolved via the -L path on the link line below.
LDFLAGS = -lcudart

OBJS = main.o cuda.o

all: program

# Link with g++ (not nvcc): works here because cuda.cu is a single,
# self-contained device translation unit. Adjust the -L path to the
# installed CUDA toolkit version.
program: $(OBJS)
        $(CXX) $(CXXFLAGS) -L/usr/local/cuda-11/lib64 cuda.o main.o -o program.out $(LDFLAGS)

# Pure C++ translation unit; no CUDA headers needed beyond wrapper.hpp.
main.o: main.cpp wrapper.hpp
        $(CXX) $(CXXFLAGS) -c main.cpp

# Device code: compiled by nvcc into a regular host-linkable object.
cuda.o: cuda.cu wrapper.hpp
        $(NVCC) $(NVCCFLAGS) -c cuda.cu
darclander
  • 1,526
  • 1
  • 13
  • 35