Is there any way I can call CUDA runtime function calls such as
cudaMemcpy(...);
in a .cpp file, compiled with a regular C++ compiler?
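For reference: the CUDA runtime API is an ordinary C/C++ API, so a .cpp file compiled with g++ can call cudaMemcpy(...) and friends directly, provided it includes cuda_runtime.h and links against libcudart; only the <<< >>> kernel-launch syntax requires nvcc. A minimal sketch of that, assuming a standard install under /usr/local/cuda (the file name is illustrative):

// host_only.cpp -- built with g++ alone, no nvcc
#include <cuda_runtime.h>   // declares cudaMalloc, cudaMemcpy, cudaFree, ...
#include <cstdio>

int main() {
    int host_val = 42, result = 0;
    int *dev_ptr = nullptr;

    // Ordinary runtime API calls; none of this needs nvcc.
    cudaMalloc((void **)&dev_ptr, sizeof(int));
    cudaMemcpy(dev_ptr, &host_val, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(&result, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_ptr);

    printf("round trip: %d\n", result);
    return 0;
}

Build with something like: g++ host_only.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart. Kernels themselves still have to live in a .cu file compiled by nvcc, which is what the wrapper pattern below is for.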
EDIT: There was an example here, but it's no longer found; most of the example is copied below.
The caller, in C (but it could be C++):

#include <stdio.h>

/* Declared here, defined in the .cu file. Note that no CUDA headers
   are needed on this side; the caller only sees an ordinary function. */
extern void kernel_wrapper(int *a, int *b);

int main(int argc, char *argv[])
{
    int a = 2;
    int b = 3;

    kernel_wrapper(&a, &b);

    printf("a = %d, b = %d\n", a, b);  /* expect 12 and 6 */
    return 0;
}
The callee (CUDA, compiled with nvcc):

__global__ void kernel(int *a, int *b)
{
    int tx = threadIdx.x;

    switch (tx) {
    case 0:
        *a = *a + 10;
        break;
    case 1:
        *b = *b + 3;
        break;
    default:
        break;
    }
}

// extern "C" gives the wrapper an unmangled name so the C caller can
// link against it; if the caller is compiled as C++, declare it
// extern "C" there too (or drop it on both sides).
extern "C" void kernel_wrapper(int *a, int *b)
{
    int *d_1, *d_2;
    dim3 threads(2, 1);
    dim3 blocks(1, 1);

    cudaMalloc((void **)&d_1, sizeof(int));
    cudaMalloc((void **)&d_2, sizeof(int));

    cudaMemcpy(d_1, a, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_2, b, sizeof(int), cudaMemcpyHostToDevice);

    // Launch with the device pointers, not the host pointers.
    kernel<<<blocks, threads>>>(d_1, d_2);

    cudaMemcpy(a, d_1, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(b, d_2, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(d_1);
    cudaFree(d_2);
}
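A build matching this split might look like the following (the file names main.c and kernel.cu and the library path are assumptions; adjust them to your setup, and link with g++ so libcudart's C++ dependencies resolve):

nvcc -c kernel.cu          # device code: needs nvcc
gcc  -c main.c             # host side: any C/C++ compiler works
g++  main.o kernel.o -L/usr/local/cuda/lib64 -lcudart -o app

Running ./app should leave a == 12 and b == 6 back in main.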
Similarly to @PreetSangha (who provided a very useful answer), I had some issues when running it as extern ..., so I would just like to add the solution that worked for me (including templated function calls).
This is the code for my example (the full CUDA code is excluded because it is already in @PreetSangha's example); it is meant to give the main idea of how it works. It compiled and ran on a Linux machine; I haven't tried it on Windows yet, but it should be similar. In my scenario I wanted int, float and double, but more templates could be added.
// main.cpp
#include "wrapper.hpp"

int main(int argc, char *argv[]) {
    // runOnGPU expects two input arrays and the matrix dimension n
    int a[4] = {1, 2, 3, 4};
    int b[4] = {5, 6, 7, 8};
    runOnGPU(a, b, 2);   // multiply two 2x2 matrices
    return 0;
}
// cuda.cu
#include "wrapper.hpp"

template <typename T>
__global__ static void matMultCUDA(const T* a, const T* b, T* c, int n) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard both the reads and the write, so threads outside the
    // matrix never touch out-of-bounds memory.
    if (col < n && row < n) {
        T value = 0;
        for (int j = 0; j < n; j++) {
            value += a[row * n + j] * b[j * n + col];
        }
        c[row * n + col] = value;
    }
}

bool InitCUDA(bool b) {
    /* CUDA initialization */
}

template <typename T>
float runOnGPU(T* a, T* b, int n) {
    /* Do CUDA things here :D
       (allocate cuda_a/cuda_b/cuda_c, copy a and b over,
       set up dimGrid/dimBlock, copy the result back, ...) */
    matMultCUDA<<<dimGrid, dimBlock>>>(cuda_a, cuda_b, cuda_c, n);
}

// Explicit instantiations: the template definition lives in this .cu
// file, so every T the C++ side may call with must be instantiated
// here, or the linker will not find the symbol.
template float runOnGPU<int>(int* a, int* b, int n);
template float runOnGPU<float>(float* a, float* b, int n);
template float runOnGPU<double>(double* a, double* b, int n);
// wrapper.hpp
#pragma once

bool InitCUDA(bool b);

// Only the declaration is visible to the C++ side; the definition and
// its explicit instantiations live in cuda.cu.
template <typename T>
float runOnGPU(T* a, T* b, int n);
# makefile (each recipe line must be indented with a real tab)
CXX       = g++
CXXFLAGS  = -O3
NVCC      = nvcc
NVCCFLAGS = -O3
LDFLAGS   = -L/usr/local/cuda-11/lib64 -lcudart

OBJS = main.o cuda.o

all: program

program: $(OBJS)
	$(CXX) $(CXXFLAGS) $(OBJS) -o program $(LDFLAGS)

main.o: main.cpp wrapper.hpp
	$(CXX) $(CXXFLAGS) -c main.cpp

cuda.o: cuda.cu wrapper.hpp
	$(NVCC) $(NVCCFLAGS) -c cuda.cu
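With that in place (and assuming CUDA 11 really is installed at the path given in LDFLAGS), building and running is just:

make
./program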