Registering Mapped Linux Character Device Memory with cudaHostRegister Results in Invalid Argument

Question

I'm trying to boost DMA<->CPU<->GPU data transfer by: 1. Mapping my (proprietary) device's Linux kernel-allocated memory to user space; 2. Registering the latter (the mapped memory) with CUDA using the cudaHostRegister API function.

While user-space-allocated memory that is mapped to my device's DMA and then registered with CUDA via cudaHostRegister works just fine, trying to register "kmalloced" memory results in an "Invalid Argument" error returned by cudaHostRegister.

At first I thought the problem was with alignment or with my device driver's complicated memory pool management, so I wrote the simplest possible character device, which implements .mmap() such that a kzalloc'ed 10 KB buffer is remapped with remap_pfn_range, and the problem still stands.

Unfortunately, I did not find any similar questions on the Net, so I sincerely hope I'll find an answer here.

Some system info and Kernel driver <-> user space app code + runtime log info:

CUDA    : 8.0
OS Dist : Ubuntu 14.04
Kernel  : 3.16.0-31-generic
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 375.26                 Driver Version: 375.26                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|                                                                               
|   0  GeForce GTX 770     Off  | 0000:83:00.0     N/A |                  N/A |
| 26%   32C    P8    N/A /  N/A |     79MiB /  1997MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+

Character device mmap() code:

/* Size of one buffer chunk. _K is defined elsewhere; the run-time log prints
 * this value as 4096. Parenthesized so the macro expands safely when used
 * next to higher-precedence operators (e.g. `x % MEM_CHUNK_SIZE`). */
#define MEM_CHUNK_SIZE  (4 * _K)
/* Size of the kzalloc'ed pool that backs the mapping (same units as above). */
#define MEM_POOL_SIZE   (10 * _K)
/**/
/*
 * chdv_mmap - .mmap() handler of the test character device.
 *
 * Lazily allocates the global pool g_membuff (MEM_POOL_SIZE bytes, zeroed),
 * validates the requested mapping, remaps the pool into the caller's VMA with
 * remap_pfn_range() and finally stamps marker words at the start and end of
 * the mapped range so user space can verify it sees the same memory.
 *
 * @filp: file being mapped (unused here).
 * @vma:  user VMA describing the requested mapping.
 *
 * Returns 0 on success, -ENOMEM if the pool cannot be allocated, -EINVAL if
 * the request is larger than the pool, the page offset is not a multiple of
 * pages_per_buf, or remap_pfn_range() fails.
 */
static int  chdv_mmap(struct file *filp, struct vm_area_struct *vma)
{
    /* Number of pages that make up one chunk. */
    unsigned int pages_per_buf = ( MEM_CHUNK_SIZE >> PAGE_SHIFT ) ;
    unsigned long pfn, vsize;

    /*make sure the buffer is allocated*/
    if((NULL == g_membuff) && 
       (NULL == (g_membuff = kzalloc(MEM_POOL_SIZE , GFP_KERNEL))))
    {
        kdbgprintln("Error: Not enough memory");
        return -ENOMEM;
    }

    /* Length of the requested mapping in bytes. */
    vsize = vma->vm_end - vma->vm_start ;

    kdbgprintln("MEM_CHUNK_SIZE %u, pages_per_buf %u, vsize %lu  vma->vm_pgoff %lu",
            MEM_CHUNK_SIZE,
            pages_per_buf,
            vsize,
            vma->vm_pgoff);

    /* Never map more than the pool holds. */
    if(vsize > MEM_POOL_SIZE)
    {
        kdbgprintln("Error: vsize %lu > MEM_POOL_SIZE %u", vsize, MEM_POOL_SIZE);
        return -EINVAL;
    }

    /* We allow only mapping of one whole buffer so offset must be multiple
     * of pages_per_buf and size must be equal to dma_buf_size.
     */
    if( vma->vm_pgoff % pages_per_buf ) 
    {
        kdbgprintln("Error:Mapping DMA buffers is allowed only from beginning");
        return -EINVAL ;
    }

    vma->vm_flags = vma->vm_flags | (VM_DONTEXPAND | VM_LOCKED | VM_IO);

    /*Get the PFN for remap*/
    /* Fixed: the cast previously contained a stray identifier pasted into the
     * type name, which does not compile. */
    pfn = page_to_pfn(virt_to_page((unsigned char *)g_membuff));

    kdbgprintln("PFN : %lu", pfn);

    /* NOTE(review): vma->vm_pgoff is validated above but is not added to pfn,
     * so every mapping starts at the beginning of g_membuff regardless of the
     * offset user space passed to mmap() - confirm this is intended. */
    if(remap_pfn_range(vma, vma->vm_start, pfn, vsize, vma->vm_page_prot))
    {
        kdbgprintln("Error:Failed to remap memory");
        return -EINVAL;
    }

    /*Sealing data header & footer*/
    /* First two unsigned longs and the last unsigned long of the mapped range
     * receive known patterns (vsize <= MEM_POOL_SIZE was checked above). */
    *((unsigned long *)g_membuff)       = 0xCDFFFFFFFFFFFFAB;
    *((unsigned long *)g_membuff + 1)   = 0xAB000000000000EF;
    *(unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long)) = 0xEF0000000C0000AA;

    kdbgprintln("Mapped 'kalloc' buffer" \
            "\n\t\tFirst  8 bytes: %lX" \
            "\n\t\tSecond 8 bytes: %lX" \
            "\n\t\tLast   8 bytes: %lX",
            *((unsigned long *)g_membuff),
            *((unsigned long *)g_membuff + 1),
            *(unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long)));

    return 0;
}

Test Application code:

static unsigned long map_mem_size;

int main(int argc, char** argv)
{
    int fd;
    const char dev_name[] = "/dev/chardev";
    void * address = NULL;
    long page_off = 0;
    cudaError_t cudarc;

    switch(argc)
    {
    case 2:
        page_off = atoi(argv[1]) * getpagesize();
        break;
    default:
        page_off = 0;
        break;
    }

    map_mem_size = 2 * getpagesize();

    printf("Opening %s file\n", dev_name);
    errno = 0;
    if(0 > (fd = open(dev_name, O_RDWR) ))
    {
        printf("Error %d - %s\n", errno, strerror(errno));
    }
    else
    {
        printf("About to map %lu bytes of %s device memory\n", map_mem_size, dev_name);

        errno = 0;
        if(MAP_FAILED == (address = mmap(NULL, map_mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, page_off)))
        {
            printf("Error %d - %s\n", errno, strerror(errno));
        }
        else
        {
            printf("mapped %s driver 'kmalloc' memory" \
                    "\n\t\tFirst  8 bytes : %lX" \
                    "\n\t\tSecond 8 bytes: %lX" \
                    "\n\t\tLast   8 bytes: %lX\n",
                    dev_name,
                    *((unsigned long *)address),
                    *((unsigned long *)address + 1),
                    *(unsigned long *)((unsigned char *)address + map_mem_size - sizeof(unsigned long)));

            if (cudaSuccess != (cudarc = cudaHostRegister(address, map_mem_size, cudaHostRegisterDefault)))
            {
                printf("Error: Failed cudaHostRegister: %s, address %p\n", cudaGetErrorString(cudarc), address);
            }
        }
    }

    /*Release resources block*/

    return EXIT_SUCCESS;
}

Run time debug information:

User space:

./chrdev_test 
Opening /dev/chardev file
About to map 8192 bytes of /dev/chardev device memory
mapped /dev/chardev driver 'kmalloc' memory
                First  8 bytes : CDFFFFFFFFFFFFAB
                Second 8 bytes: AB000000000000EF
                Last   8 bytes: EF0000000C0000AA
Error: Failed cudaHostRegister: invalid argument
Unmapping /dev/chardev file
Closing /dev/chardev file

Kernel space (tail -f /var/log/syslog):

 [ 4814.119537] [chardev] chardev.c, chdv_mmap, line 292:MEM_CHUNK_SIZE 4096, pages_per_buf 1, vsize 8192  vma->vm_pgoff 0
    [ 4814.119538] [chardev] chardev.c, chdv_mmap, line 311:PFN : 16306184
    [ 4814.119543] [chardev] chardev.c, chdv_mmap, line 330:Mapped 'kzalloced' buffer
    [ 4814.119543]           First  8 bytes: CDFFFFFFFFFFFFAB
    [ 4814.119543]           Second 8 bytes: AB000000000000EF
    [ 4814.119543]           Last   8 bytes: EF0000000C0000AA

Thanks ahead.

@ Robert Crovella That's the point! If you mmap any userland file it works just fine, the problem appears when I mmap kernel memory. — Yoelson, Jun 15 '17 at 15:20
Indeed, I stand corrected: the `mmap` of an (ordinary) file followed by registering does appear to work — Robert Crovella, Jun 15 '17 at 16:36
@talonmies This is strange, as far as I know most of v4l2 drivers allocate memory in kernel, in case you are right, there is no way to boost their data via cudaHostRegister, in this case someone besides me would have experienced the same problem, but I don't find any recollections of that in the Net... — Yoelson, Jun 15 '17 at 18:33
@talonmies In any case, could you please be more specific i.e.: Why is it impossible? What is the alternative besides mapping userland allocated memory to the driver and to Cuda? — Yoelson, Jun 16 '17 at 08:18

score 1 · Answer 1 · answered Jun 27 '17 at 12:18

Made it work!

The full answer may be found in: https://devtalk.nvidia.com/default/topic/1014391/cuda-programming-and-performance/registering-mapped-linux-character-device-memory-with-cudahostregister-results-in-invalid-argument/?offset=3#5174771
There is a problem with memory chunks longer than 2 pages (> 8K) working with Cuda...

Thanks, Yoel.

Registering Mapped Linux Character Device Memory with cudaHostRegister Results in Invalid Argument

1 Answer

Linked