
In my code I want to allocate memory for a pointer data member of a class during kernel execution and write to it from the kernel. Then I want to read that data back on the host. With my approach, however, I don't get the right data on the host (see below). Is my approach completely off, or can you spot the erroneous part?

#include <cuda_runtime.h>
#include <stdio.h>

class OutputData {
public:
  int *data;
};

__global__ void init(OutputData *buffer)
{
  // allocate memory for data
  buffer->data = (int*) malloc(sizeof(int)*2);

  // write data
  buffer->data[0] = 1;
  buffer->data[1] = 2;
}

int main(int argc, char **argv)
{
  // malloc device memory
  OutputData *d_buffer;
  cudaMalloc(&d_buffer, sizeof(OutputData));

  // run kernel
  init<<<1,1>>>(d_buffer);
  cudaDeviceSynchronize();

  // malloc host memory
  OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));

  //transfer data from device to host
  cudaMemcpy(h_buffer, d_buffer, sizeof(OutputData), cudaMemcpyDeviceToHost);
  int* h_data = (int*) malloc(sizeof(int)*2);
  cudaMemcpy(h_data, h_buffer->data, sizeof(int)*2, cudaMemcpyDeviceToHost);

  // Print the data
  printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

  // free memory
  cudaFree(h_buffer->data);
  free(h_buffer);
  cudaFree(d_buffer);
  free(h_data);

  return (0);
}

The output is

h_data[0] = 0, h_data[1] = 0

and not

h_data[0] = 1, h_data[1] = 2

as expected.

malo2784
    Memory allocated on the device runtime heap using new or malloc cannot be accessed by the host APIs – talonmies Sep 15 '19 at 10:20
  • Any time you're having trouble with a CUDA code, its usually good practice to do [proper CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) and also run your code with `cuda-memcheck`. I usually suggest people do these things **before** asking others for help. Even if you don't understand the error output, it will often be useful to those trying to help you. – Robert Crovella Sep 15 '19 at 14:05
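
For reference, the kind of error checking the comment above suggests could look like the following sketch (the checkCuda macro name is hypothetical, not part of the CUDA API):

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Wrap every runtime API call so a failure is reported immediately
#define checkCuda(call)                                              \
  do {                                                               \
    cudaError_t err__ = (call);                                      \
    if (err__ != cudaSuccess) {                                      \
      fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                \
              cudaGetErrorString(err__), __FILE__, __LINE__);        \
      exit(EXIT_FAILURE);                                            \
    }                                                                \
  } while (0)

// Example: wrapping the failing copy from the question would report the
// error instead of silently leaving h_data unchanged:
//   checkCuda(cudaMemcpy(h_data, h_buffer->data, sizeof(int)*2,
//                        cudaMemcpyDeviceToHost));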

1 Answer


As per the documentation:

In addition, device malloc() memory cannot be used in any runtime or driver API calls (i.e. cudaMemcpy, cudaMemset, etc).

To confirm this, let's run your code with cuda-memcheck:

$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.cu 
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 36791296, h_data[1] = 0
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
=========     Host Frame:./heapcopy [0x3cb0a]
=========     Host Frame:./heapcopy [0x31ac]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:./heapcopy [0x2fd9]
=========
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to "invalid device pointer" on CUDA API call to cudaFree. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
=========     Host Frame:./heapcopy [0x44f00]
=========     Host Frame:./heapcopy [0x31dc]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:./heapcopy [0x2fd9]
=========
========= ERROR SUMMARY: 2 errors

This is why your code fails -- the address stored in h_buffer->data is not accessible to the host API. Note also that it cannot be freed from the host either.

You could do something like this, which uses a managed memory allocation as the host memory (so it is directly accessible within the kernel) and a device-side cudaMemcpyAsync call:

#include <cuda_runtime.h>
#include <stdio.h>

class OutputData {
public:
  int *data;
};

__global__ void init(OutputData *buffer)
{
  // allocate memory for data
  buffer->data = (int*) malloc(sizeof(int)*2);

  // write data
  buffer->data[0] = 1;
  buffer->data[1] = 2;
}

__global__ void deepcopy(OutputData* dest, OutputData* source, size_t datasz)
{
    cudaMemcpyAsync(dest->data, source->data, datasz, cudaMemcpyDeviceToDevice);
}

int main(int argc, char **argv)
{
  // malloc device memory
  OutputData *d_buffer;
  cudaMalloc(&d_buffer, sizeof(OutputData));

  // run kernel
  init<<<1,1>>>(d_buffer);
  cudaDeviceSynchronize();

  // malloc host memory as managed memory
  //OutputData *h_buffer = (OutputData*) malloc(sizeof(OutputData));
  //int* h_data = (int*) malloc(sizeof(int)*2);
  size_t dsize = sizeof(int)*2;
  OutputData* h_buffer; cudaMallocManaged(&h_buffer, sizeof(OutputData));
  int* h_data; cudaMallocManaged(&h_data, dsize);
  h_buffer->data = h_data;

  // run kernel
  deepcopy<<<1,1>>>(h_buffer, d_buffer, dsize);
  cudaDeviceSynchronize();

  // Print the data
  printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

  // free memory
  cudaFree(h_data);
  cudaFree(h_buffer);
  cudaFree(d_buffer);

  return (0);
}

This runs as expected (note that there is technically a device heap memory leak here, because a device-side free call is never made; see the sketch after the output):

$ nvcc -std=c++11 -arch=sm_52 -dc -o heapcopy.o heapcopy.cu 
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.o 
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 1, h_data[1] = 2
========= ERROR SUMMARY: 0 errors
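
If that leak matters, one way to handle it (just a sketch, reusing the OutputData class above) is to launch a small cleanup kernel that frees the device-heap allocation before d_buffer itself is released:

__global__ void cleanup(OutputData *buffer)
{
    // memory obtained from device malloc() must be released by device free()
    free(buffer->data);
}

// in main(), before cudaFree(d_buffer):
//   cleanup<<<1,1>>>(d_buffer);
//   cudaDeviceSynchronize();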

There are other variations (like building a complete mirror of the heap structure in global memory from the host and then running the copy kernel), but they make even less sense than this does.
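
For completeness, a rough sketch of that mirror variation (reusing the deepcopy kernel and dsize from the code above; the d_mirror names are mine):

  // mirror of the int array, allocated with cudaMalloc so the host API can use it
  int *d_mirror_data;
  cudaMalloc(&d_mirror_data, dsize);

  // mirror of the struct, with its data member pointed at the cudaMalloc'd buffer
  OutputData *d_mirror;
  cudaMalloc(&d_mirror, sizeof(OutputData));
  cudaMemcpy(&(d_mirror->data), &d_mirror_data, sizeof(int*), cudaMemcpyHostToDevice);

  // device-side deep copy from the heap allocation into the mirror
  deepcopy<<<1,1>>>(d_mirror, d_buffer, dsize);
  cudaDeviceSynchronize();

  // everything is now reachable via the host API
  int h_vals[2];
  cudaMemcpy(h_vals, d_mirror_data, dsize, cudaMemcpyDeviceToHost);

  // (cudaFree of d_mirror and d_mirror_data omitted for brevity)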

talonmies