As per the documentation:
In addition, device malloc() memory cannot be used in any runtime or driver API calls (i.e. cudaMemcpy, cudaMemset, etc).
To confirm this, let's run your code with cuda-memcheck
:
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.cu
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 36791296, h_data[1] = 0
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x3cb0a]
========= Host Frame:./heapcopy [0x31ac]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to "invalid device pointer" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x44f00]
========= Host Frame:./heapcopy [0x31dc]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= ERROR SUMMARY: 2 errors
This is why your code fails -- the address at h_buffer->data
is not host API accessible. Note also that it can't be free'd from the host either.
You could do something like this, which uses a managed memory allocation as the host memory (so it is directly accessible within the kernel), and a device side cudaMemcpyAsync
call:
#include <cuda_runtime.h>
#include <stdio.h>
class OutputData {
public:
int *data;
};
__global__ void init(OutputData *buffer)
{
// allocate memory for data
buffer->data = (int*) malloc(sizeof(int)*2);
// write data
buffer->data[0] = 1;
buffer->data[1] = 2;
}
__global__ void deepcopy(OutputData* dest, OutputData* source, size_t datasz)
{
cudaMemcpyAsync(dest->data, source->data, datasz, cudaMemcpyDeviceToDevice);
}
int main(int argc, char **argv)
{
// malloc device memory
OutputData *d_buffer;
cudaMalloc(&d_buffer, sizeof(OutputData));
// run kernel
init<<<1,1>>>(d_buffer);
cudaDeviceSynchronize();
// malloc host memory as managed memory
//OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));
//int* h_data = (int*) malloc(sizeof(int)*2);
size_t dsize = sizeof(int)*2;
OutputData* h_buffer; cudaMallocManaged(&h_buffer, sizeof(OutputData));
int* h_data; cudaMallocManaged(&h_data, dsize);
h_buffer->data = h_data;
// run kernel
deepcopy<<<1,1>>>(h_buffer, d_buffer, dsize);
cudaDeviceSynchronize();
// Print the data
printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);
// free memory
cudaFree(h_data);
cudaFree(h_buffer);
cudaFree(d_buffer);
return (0);
}
Which runs as expected (note there is technically a device heap memory leak here because a device side free
call is never made):
$ nvcc -std=c++11 -arch=sm_52 -dc -o heapcopy.o heapcopy.cu
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.o
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 1, h_data[1] = 2
========= ERROR SUMMARY: 0 errors
There are other variations (like building a complete mirror structure of the heap structure in global memory from the host and then running the copy kernel), but those make even less sense than this does.