I'm trying to build a simple Python/CUDA interface using CFFI. However, I consistently get an "invalid argument" error in the data retrieval stage, from the cudaMemcpy call that uses cudaMemcpyDeviceToHost. Can anyone spot the issue? I'd really appreciate any help.
The code I want to wrap is in a single file called "array.cu":
// array.cu
#include "array.h"
using namespace std;
// Logs the incoming pointers and length, then allocates room for `length`
// floats with cudaMalloc and copies `host_array` into that allocation.
//
//   host_array    source buffer read by the host-to-device cudaMemcpy
//   device_array  pointer value received from the caller (passed by value)
//   length        element count; the byte size is length * sizeof(float)
//
// NOTE(review): `device_array` is a by-value parameter, so cudaMalloc stores
// the address of the new allocation only in this function's local copy. The
// caller's pointer is left untouched, and the allocation is neither freed nor
// returned here. The d_ptr value logged below is printed before cudaMalloc
// runs, i.e. it is the caller-supplied value, not the allocation's address.
// Handing the allocation back would need a signature change (for example a
// `float**` out-parameter) together with a matching change in every caller.
void allocate(
    float* host_array,
    float* device_array,
    int length
) {
    const size_t num_bytes = length * sizeof(float);

    cout << "Allocating h_ptr (" << host_array << ") "
         << "on device using d_ptr (" << device_array << ") "
         << "of length=" << length << endl;

    // Overwrites the local parameter only; see the note above.
    CUCHK(cudaMalloc(reinterpret_cast<void**>(&device_array), num_bytes));
    CUCHK(cudaMemcpy(device_array, host_array, num_bytes, cudaMemcpyHostToDevice));
}
// Logs the pointers and length, then copies `length` floats from
// `device_array` into `host_array` with a device-to-host cudaMemcpy.
//
//   device_array  source pointer for the copy
//   host_array    destination buffer for the copy
//   length        element count; the byte size is length * sizeof(float)
//
// NOTE(review): nothing here validates `device_array`. Presumably it has to be
// an address produced by cudaMalloc for the copy to succeed - confirm that
// callers actually pass one.
void retrieve(
    float* device_array,
    float* host_array,
    int length
) {
    const size_t num_bytes = length * sizeof(float);

    cout << "Retrieving h_ptr (" << host_array << ") "
         << "from device using d_ptr (" << device_array << ") "
         << "of length=" << length << endl;

    CUCHK(cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost));
}
I've also written a Python wrapper for it, in the form of a small library I call "cupid":
# cupid.py
import numpy as np
from cffi import FFI
# Module-wide FFI instance; the class below uses it for cdef(), cast(), new(),
# buffer() and sizeof().
ffi = FFI()
# Load the compiled shared library through dlopen(). The path starts with "./",
# so it is resolved relative to the current working directory.
lib = ffi.dlopen("./cupid/src/libAlg.so")
class cupid:
    """Pair a NumPy array with a buffer handled by the C functions in ``lib``.

    Construction converts the input to a C-contiguous ``float32`` array, keeps
    that array alive on the instance, and passes its address to the C
    ``allocate`` function. ``retrieve()`` asks the C ``retrieve`` function to
    fill the same host memory and returns it with the original shape.
    """

    # C declarations for the two library entry points called below.
    _C_DECLARATIONS = """
        void allocate(
            float *host_array,
            float *device_array,
            int length
        );
        void retrieve(
            float* device_array,
            float* host_array,
            int length
        );
    """
    # Set once the declarations above have been handed to ffi.cdef().
    _declared = False

    def __init__(self, numpy_array):
        """Wrap ``numpy_array`` and immediately call ``allocate()``.

        Args:
            numpy_array: array to wrap; its ``size``, ``shape`` and ``dtype``
                attributes are read, so it must be an ndarray-like object.
        """
        self._numpy_array = numpy_array
        # Keep the converted array on the instance. When ascontiguousarray has
        # to copy or convert (non-float32 or non-contiguous input), that copy is
        # the only owner of the memory `_host_array` points to; without this
        # reference it could be garbage-collected, leaving a dangling pointer.
        self._host_buffer = np.ascontiguousarray(numpy_array, np.float32)
        self._host_array = ffi.cast("float *", self._host_buffer.ctypes.data)
        # NOTE(review): ffi.new("float *") allocates host storage for a single
        # float and yields a pointer to it. The C `allocate` receives that
        # pointer by value, so whatever it assigns to its own parameter never
        # reaches this attribute. Handing a device address back to Python would
        # need an out-parameter (e.g. "float **") or a return value, changed on
        # the C and Python sides together.
        self._device_array = ffi.new("float *")
        self._length = numpy_array.size
        self._shape = numpy_array.shape
        self._dtype = numpy_array.dtype
        self.allocate()

    @staticmethod
    def _declare():
        """Register the C declarations with ``ffi`` the first time it is needed."""
        if not cupid._declared:
            ffi.cdef(cupid._C_DECLARATIONS)
            cupid._declared = True

    def allocate(self):
        """Call the C ``allocate`` with the host pointer, device pointer and length."""
        self._declare()
        lib.allocate(self._host_array, self._device_array, self._length)

    def retrieve(self):
        """Call the C ``retrieve`` and return the host data in the original shape.

        Returns:
            A ``float32`` view of the retained host buffer, reshaped to the
            shape of the array given at construction. The view shares memory
            with the buffer the C function wrote into and keeps it alive.
        """
        self._declare()
        lib.retrieve(self._device_array, self._host_array, self._length)
        # `_host_array` points at `_host_buffer`'s memory, so the data written
        # by the C call is already visible through that array.
        self._numpy_array = self._host_buffer.reshape(tuple(self._shape))
        return self._numpy_array
which is being called as:
# test.py
import numpy as np
from cupid import cupid
from pprint import pprint
# Start from a 5x6 float32 array of zeros.
numpy_array = np.zeros((5,6), dtype=np.float32)
# Constructing the wrapper calls its allocate() method from __init__.
cupid_array = cupid(numpy_array)
# Ask the wrapper to copy the data back; the result replaces the input array name.
numpy_array = cupid_array.retrieve()
Producing the output:
Allocating h_ptr (0x559552b81fa0) on device using d_ptr (0x559552b80fe0) of length=30
Retrieving h_ptr (0x559552b81fa0) from device using d_ptr (0x559552b80fe0) of length=30
Cuda error in file 'array.cu' in line 25 : invalid argument. (<- the cudaMemcpyDeviceToHost)
Looking at the printed addresses, I see that the host and device pointers are the same in both stages, and nothing is freed in between - it just seems that the device array loses its CUDA credentials and isn't recognised as a device array anymore.