cuda - Zero-copy memory, memory-mapped file

Question

I am trying to create a memory-mapped file containing uint32_t values and then use it as zero-copy pinned memory for CUDA, as shown below. After allocating space and mapping the memory from the file, I get cudaErrorInvalidValue when getting the device pointer. I know that the error message (from the API) means:

This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values.

But I'm struggling to figure out why I'm having this problem.... Any ideas? Thanks in advance.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

    …

// Question listing (parts elided with "…" by the poster): attempts to obtain a
// device pointer for a memory-mapped file so it can be used as zero-copy
// memory. The cudaHostGetDevicePointer call near the bottom is where the
// poster reports cudaErrorInvalidValue.
int main(void) 
{
  struct stat buf;

    …

  uint32_t *data, *dev_data;

  cudaDeviceProp cuda_prop;
  cudaGetDeviceProperties(&cuda_prop, 0);
  // Give up if device 0 reports that it cannot map host memory.
  if (!cuda_prop.canMapHostMemory) 
    exit(EXIT_FAILURE);

  cudaSetDeviceFlags(cudaDeviceMapHost);


  int data_file = open(data_file_name, O_RDONLY);
  // NOTE(review): sa_file is not declared anywhere in the visible listing; the
  // descriptor opened on the previous line is data_file — confirm which was meant.
  int stat = fstat(sa_file, &buf);
  int data_file_size = buf.st_size;

  // Requests data_file_size bytes with cudaHostAllocMapped and stores the
  // resulting host pointer in data.
  err = cudaHostAlloc((void**)&data, data_file_size, cudaHostAllocMapped);
  if (err == cudaErrorMemoryAllocation) exit(EXIT_FAILURE);

  // Reassigns data to the address returned by mmap; the pointer written by
  // cudaHostAlloc above is no longer held in any variable after this line.
  data = (uint32_t*) mmap(0, data_file_size, PROT_READ, MAP_PRIVATE, data_file, 0);

  // At this point data holds the mmap address, not the cudaHostAlloc one.
  err = cudaHostGetDevicePointer((void**)&dev_data, (void*)data, 0);
  if (err == cudaErrorMemoryAllocation)
  {
    printf("cudaHostGetDevicePointer - Mem Alloc Err\n"); 
    exit(EXIT_FAILURE);
  }
  else if (err == cudaErrorInvalidValue) //ERROR HERE.
  {
    printf("cudaHostGetDevicePointer - Invalid Val Err\n"); 
    exit(EXIT_FAILURE);
  }

    …

}

`cudaHostAlloc` assigned a value to `data`, and then you overwrote that value with your `data = mmap(...)` line. `cudaHostGetDevicePointer` has no idea what to do with the new value of `data` provided by `mmap` since that value was not provided by the CUDA API. You could try getting rid of the `cudaHostAlloc` line, and then do a `cudaHostRegister` on `data` after the `mmap` line. I don't know if that will work or not. — Robert Crovella, Apr 08 '15 at 15:34
I'd be very surprised if the GPU driver could register a virtual allocation from a mmaped file — talonmies, Apr 08 '15 at 15:47

score 4 · Accepted Answer · edited Aug 20 '21 at 23:28

One problem is that the logical sequence of your program is incorrect. This line assigns a value to data provided by the CUDA API:

err = cudaHostAlloc((void**)&data, data_file_size, cudaHostAllocMapped);

This line then overwrites that value, with a new one:

data = (uint32_t*) mmap(0, data_file_size, PROT_READ, MAP_PRIVATE, data_file, 0);

At that point, the value of data is not recognized by the CUDA API as being a pinned memory space anymore, so when you call this:

err = cudaHostGetDevicePointer((void**)&dev_data, (void*)data, 0);

you get an error, because the value contained in data is not recognized.

EDIT: (based on this question) Apart from that issue, it seems that if you change the file handling from read-only to read-write, then this process can be made to work (it throws no runtime errors). Here's a complete example (which doesn't contain the above logical flaw) that demonstrates this (I have previously created a test.dat file of size 566316 bytes):

$ cat t706.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>

// Demonstrates zero-copy access to a memory-mapped file:
//   1. maps test.dat into the host address space,
//   2. page-locks that mapping with cudaHostRegister,
//   3. asks the runtime for the device pointer that aliases the mapping.
//
// The file is opened O_RDWR and mapped PROT_READ|PROT_WRITE because the
// read-only variant was reported not to work with the default registration
// flags. MAP_PRIVATE keeps any writes out of the file on disk.
//
// Returns 0 on success; on any failure prints a diagnostic to stderr and
// exits with EXIT_FAILURE.
int main(void)
{
  struct stat buf;
  char *dev_data;

  cudaDeviceProp cuda_prop;
  cudaError_t err = cudaGetDeviceProperties(&cuda_prop, 0);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaGetDeviceProperties fail: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
  if (!cuda_prop.canMapHostMemory) {
    fprintf(stderr, "device 0 cannot map host memory\n");
    exit(EXIT_FAILURE);
  }

  err = cudaSetDeviceFlags(cudaDeviceMapHost);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaSetDeviceFlags fail: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  int data_file = open("test.dat", O_RDWR);
  if (data_file < 0) {
    perror("open test.dat");
    exit(EXIT_FAILURE);
  }
  if (fstat(data_file, &buf) != 0) {
    perror("fstat");
    exit(EXIT_FAILURE);
  }

  // st_size is an off_t; keep the full width instead of narrowing to int so
  // files of 2 GiB and larger are not truncated.
  size_t data_file_size = (size_t)buf.st_size;
  printf("data_file_size = %zu\n", data_file_size);

  // A zero-length file makes mmap fail, which is reported by the check below.
  char *data = (char *) mmap(0, data_file_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, data_file, 0);
  if (data == MAP_FAILED) {
    perror("mmap failure");
    exit(EXIT_FAILURE);
  }

  // Pin the mapped range so the CUDA runtime recognises the address.
  err = cudaHostRegister(data, data_file_size, cudaHostRegisterDefault);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaHostRegister fail: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Any non-success code is fatal, not just the two the original listed.
  err = cudaHostGetDevicePointer((void**)&dev_data, (void*)data, 0);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaHostGetDevicePointer fail: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // dev_data now aliases the mapped file and could be handed to a kernel here.

  // Release in reverse order of acquisition: unpin, unmap, close.
  err = cudaHostUnregister(data);
  if (err != cudaSuccess)
    fprintf(stderr, "cudaHostUnregister fail: %s\n", cudaGetErrorString(err));
  if (munmap(data, data_file_size) != 0)
    perror("munmap");
  if (close(data_file) != 0)
    perror("close");

  return 0;
}
$ nvcc -arch=sm_30 -o t706 t706.cu
$ ./t706
data_file_size = 566316
$

Thank you for trying, and for correcting the host allocation logic! I suppose the easiest way around this is probably `fread`ing instead of using `mmap`... — PidgeyBAWK, Apr 08 '15 at 16:18
Passing the flag `cudaHostRegisterReadOnly` to `cudaHostRegister()` allows the memory to be mapped without needing to be marked `PROT_WRITE`. This not only makes the code cleaner, it also appears to confer a decent speed bump, at least in my test setup. — John Kugelman, Aug 21 '21 at 00:29

cuda - Zero-copy memory, memory-mapped file

1 Answer

Linked