I have got an object say d_obj
that has some members on the unified memory and some members explicitly on the device memory. I then call a CUDA kernel that takes the object and works with it. I would like to immediately have CPU do some stuff with the members on the unified memory right after the kernel call, but that fails. Here I reproduce my problem using a short code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define CHECK_CUDA(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("ERROR:: File: %s, Line: %d, ", __FILE__, __LINE__); \
printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
}
class MyClass
{
public:
MyClass(int n_) : n(n_) { }
void allocateMeOnDevice() {
CHECK_CUDA(cudaMalloc((void**)&vec, n * sizeof(float)));
}
int n;
float* vec;
};
__global__ void kernel(MyClass* obj) {
for (int i = 0; i < obj->n; i++) {
obj->vec[i] = 1;
}
}
int main() {
int n = 1000;
MyClass h_obj(n);
MyClass* d_obj;
CHECK_CUDA(cudaMallocManaged((void**)&d_obj, sizeof(MyClass)));
CHECK_CUDA(cudaMemcpy(d_obj, &h_obj, sizeof(MyClass), cudaMemcpyHostToDevice));
d_obj->allocateMeOnDevice();
kernel << <1, 1 >> > (d_obj);
//CHECK_CUDA(cudaDeviceSynchronize());
printf("** d_obj->n is %d\n", d_obj->n); // <-- Read access violation if the above line is commented out
}
Is it not possible to access something on the unified memory from both host and device at the same time? I am wondering if there is any workaround for this problem?
OS: Windows 10/ CUDA 11.2/ Device: GeForce RTX 3090