I am moving a GPU-intensive operation from Python to CUDA using Cython, but when I try to copy data from the CPU (host) to the GPU (device), I get CUDA Runtime API error 1 on the cudaMemcpy call. I have tried everything over the last few days but cannot get past this error. Since the code I am working on is quite large, I have reduced it to a small prototype for this question. The main entry point is the test.pyx file shown below:
import numpy as np
cimport numpy as np

np.import_array()

cdef extern from "test.h":
    cdef cppclass Test:
        float* data
        int msg_type
    void test_call(Test vol)

def py_test():
    cdef Test test
    data = np.ones((1, 256, 256, 256), dtype=np.float32)
    cdef float[:, :, :, ::1] data_view = data
    test.data = &(data_view[0, 0, 0, 0])
    test.msg_type = 0
    test_call(test)

def test_gpu():
    py_test()
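For debugging, a sanity check along these lines could be added to test.pyx to print the host pointer and the first element right before the call (this is only a sketch, not part of the prototype above):

from libc.stdio cimport printf

def py_test_debug():
    # Same setup as py_test, but prints the host pointer and the first value
    # before handing them over to the CUDA side.
    cdef Test test
    data = np.ones((1, 256, 256, 256), dtype=np.float32)
    cdef float[:, :, :, ::1] data_view = data
    cdef double first = data_view[0, 0, 0, 0]
    printf("host ptr: %p, first value: %f\n", <void *> &data_view[0, 0, 0, 0], first)
    test.data = &(data_view[0, 0, 0, 0])
    test.msg_type = 0
    test_call(test)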
Similarly, the CUDA file is test.cu:
#include "test.h"
void test_call(Test &test_cpu)
{
/**
* Allocate memory for GPU
*/
Test test_gpu;
mem_alloc_test(test_gpu, test_cpu);
printf("%f\n", test_cpu.msg_type);
printf("%f\n", test_gpu.msg_type);
printf("%f\n", test_cpu.data[0]);
printf("%f\n", test_gpu.data[0]);
mem_free_test(test_gpu);
}
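For reference, a stripped-down, pure-CUDA version of the same allocate-and-copy sequence (just a sketch to make the CUDA side easier to discuss in isolation; it is not part of my build) would look like this:

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main() {
    const int N = 256 * 256 * 256;       // same element count as the NumPy array
    std::vector<float> host(N, 1.0f);    // host buffer filled with ones

    float *device = nullptr;
    cudaError_t err = cudaMalloc(&device, N * sizeof(float));   // ~64 MiB on the GPU
    if (err != cudaSuccess) { std::fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err)); return 1; }

    err = cudaMemcpy(device, host.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) { std::fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err)); return 1; }

    err = cudaFree(device);              // pass the device pointer itself
    if (err != cudaSuccess) { std::fprintf(stderr, "cudaFree: %s\n", cudaGetErrorString(err)); return 1; }

    std::printf("copy succeeded\n");
    return 0;
}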
The header file is test.h:
#include <iostream>
#include <math.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

#define GLOBAL_2_HOST __host__ __device__

#define gpuErrchk(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
    if(cudaSuccess != err)
    {
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString( err ) );
        exit(-1);
    }
}

const int THREADS_PER_BLOCK = 1024;
inline int getNumBlock(int N) { return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; }

class Test {
public:
    float *data;
    int msg_type;
};

void test_call(Test &test);

void host_2_device(const float *host, float *device, int N) {
    printf("%d", N);
    gpuErrchk(cudaMemcpy(device, host, N*sizeof(float), cudaMemcpyHostToDevice));
}

void device_malloc(float **device, int N) {
    gpuErrchk(cudaMalloc(device, N*sizeof(float)));
}

void mem_alloc_test(Test &test_gpu, Test &test_cpu) {
    int N = 256 * 256 * 256;
    device_malloc(&test_gpu.data, N);
    host_2_device(test_cpu.data, test_gpu.data, N);
    test_gpu.msg_type = test_cpu.msg_type;
}

void mem_free_test(Test &test_gpu) {
    cudaFree(&test_gpu.data);
}
To build the CUDA code, run the following from the root folder:

mkdir -p build
cd build
cmake ..
make
To build the Cython code, run python setup.py build_ext --inplace from the root folder.
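In case the build setup is relevant: the setup.py follows the usual Cython-plus-CUDA-library recipe. A simplified sketch (library names and paths here are placeholders; the real file has more flags) is:

import numpy as np
from setuptools import setup, Extension
from Cython.Build import cythonize

# Placeholder paths/names: the Cython module is compiled as C++ and linked
# against the library produced by CMake plus the CUDA runtime.
ext = Extension(
    "pytest",                          # extension name matching the import in the test command below
    sources=["test.pyx"],
    language="c++",
    include_dirs=[np.get_include(), "/usr/local/cuda/include"],
    library_dirs=["build", "/usr/local/cuda/lib64"],
    libraries=["test", "cudart"],      # "test" = the CMake-built CUDA library (placeholder)
)

setup(ext_modules=cythonize([ext]))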
To test, run python -c "import pytest; pytest.test_gpu()". This fails with the error CUDA Runtime API error 1: invalid argument.
Any help regarding this will be appreciated. If you need any extra information, I'll be happy to provide it.