I have a GPU-only class T
which I want to create on GPU but have a reference to which on the CPU, so I can send the link as an argument to different CUDA kernels.
class T
{
public:
int v;
public:
__device__ T() { v = 10; }
__device__ ~T() {}
__device__ int compute() { return v; }
};
Here are the kernels that I was to create the class instance and to call the compute()
function.
__global__ void kernel(T* obj, int* out)
{
if(blockIdx.x * blockDim.x + threadIdx.x == 0) {
out[0] = obj->compute(); // no kernel error, but it returns garbage
}
}
__global__ void cudaAllocateGPUObj(T* obj)
{
if(blockIdx.x * blockDim.x + threadIdx.x == 0) {
obj = new T;
// if I call `out[0] = obj->compute();` here, everything works fine
}
}
The main function simply allocates memory for the pointer of type T*
which later is used as an argument for the cudaAllocateGPUObj
.
int main()
{
int cpu, *gpu;
cudaMalloc((void**)&gpu, sizeof(int));
T* obj;
cudaMalloc((void**)&obj, sizeof(T*));
cudaAllocateGPUObj<<<1,1>>>(obj);
kernel<<<1,1>>>(obj, gpu);
cudaMemcpy(&cpu, gpu, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("cudaMemcpy\nresult: %d\n", cpu);
return 0;
}
The problem with this code (as specified in the comments in the code) is that when I call out[0] = obj->compute();
in the cudaAllocateGPUObj
kernel and transfer the obtained value to the CPU, everything is correct. But if I want to obtain the member value in another kernel, it becomes garbage, though if I change the return value from the v
variable to a constant, everything works fine.
Could you please tell me what is wrong with this code.