0

Consider the following program (written in C syntax):

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// Minimal repro: initialize the driver API, create a context, and attempt a
// 128 KiB virtual-address reservation — which fails with "invalid argument".
int main() {
    CUresult result;
    unsigned int init_flags = 0;       // cuInit currently requires flags == 0
    result = cuInit(init_flags);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUcontext ctx;
    unsigned int ctx_create_flags = 0; // default context-creation behavior
    CUdevice device_id = 0;            // first (ordinal 0) CUDA device
    result = cuCtxCreate(&ctx, ctx_create_flags, device_id);
    // Note: The created context is also made the current context,
    // so we are _in_ a context from now on.
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUdeviceptr requested = 0;         // 0 = no fixed address requested; driver picks one
    CUdeviceptr reserved;              // out-param: start of the reserved VA range
    size_t size = 0x20000;             // 2^17 bytes = 128 KiB — NOT a multiple of the
                                       // device allocation granularity (the actual bug)
    size_t alignment = 0; // default
    unsigned long long reserve_flags = 0; // must be 0 (reserved for future use)

    // -----------------------------------
    // ==>> FAILURE on next statement <<==
    // -----------------------------------

    result = cuMemAddressReserve(&reserved, size, alignment, requested, reserve_flags);
    if (result != CUDA_SUCCESS) {
        const char* error_string;
        cuGetErrorString(result, &error_string);
        fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
        exit(EXIT_FAILURE);
    }
    return 0;
}

This fails when trying to make the reservation:

cuMemAddressReserve() failed: invalid argument

what's wrong with my arguments? Is it the size? the alignment? Requesting an address of 0? If it's the latter - how can I even know what address to request, when I don't really care?

einpoklum
  • 118,144
  • 57
  • 340
  • 684

2 Answers

3

If I recall correctly, the sizes for virtual memory management functions must be a multiple of CUDA's allocation granularity. See cuMemGetAllocationGranularity and this blog post https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/

The following works on my machine.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// Working version: query the device's minimum allocation granularity and
// round the reservation size up to a multiple of it before calling
// cuMemAddressReserve(). Also fixes two latent bugs in the original:
// the CUmemAllocationProp struct is now zero-initialized (uninitialized
// fields such as requestedHandleTypes could hold garbage and make the
// granularity query fail), and the size_t is printed with %zu instead of
// the non-portable %lu.
int main() {
    CUresult result;
    unsigned int init_flags = 0;
    result = cuInit(init_flags);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUcontext ctx;
    unsigned int ctx_create_flags = 0;
    CUdevice device_id = 0;
    result = cuCtxCreate(&ctx, ctx_create_flags, device_id);
    // Note: The created context is also made the current context,
    // so we are _in_ a context from now on.
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUdeviceptr requested = 0;        // 0 = let the driver choose the address
    CUdeviceptr reserved;
    size_t size = 0x20000;            // 128 KiB — not granularity-aligned by itself
    size_t alignment = 0;             // 0 = default alignment
    unsigned long long reserve_flags = 0;

    // Describe the kind of physical allocation this VA range would back,
    // then ask the driver for the minimum size granularity it requires.
    // Zero-initialize: the struct has more fields than we set explicitly.
    size_t granularity;
    CUmemAllocationProp prop = {0};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = (int)device_id;  // same device the context was created on
    prop.win32HandleMetaData = NULL;
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    printf("minimum granularity %zu\n", granularity);

    // Round the requested size up to the next multiple of the granularity;
    // cuMemAddressReserve() rejects sizes that are not such multiples.
    size_t padded_size = ((size + granularity - 1) / granularity) * granularity;
    result = cuMemAddressReserve(&reserved, padded_size, alignment, requested, reserve_flags);
    if (result != CUDA_SUCCESS) {
        const char* error_string;
        cuGetErrorString(result, &error_string);
        fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
        exit(EXIT_FAILURE);
    }

    // Best-effort cleanup: release the VA reservation and the context.
    (void)cuMemAddressFree(reserved, padded_size);
    (void)cuCtxDestroy(ctx);
    return 0;
}
Abator Abetor
  • 2,345
  • 1
  • 10
  • 12
  • Abetor: I actually got that figure using granularity limits. But - come on, we're talking about 2^17. How can that not be a multiple of the granularity? Anyway, I'll recheck. – einpoklum Jan 03 '22 at 13:53
  • Also - why does the reservation size have to be a multiple of the allocation granularity on some device? It's not like I'm mapping it anywhere for now. Or maybe there are different devices with different granularities? Grrr. – einpoklum Jan 03 '22 at 14:21
  • Anyway, upvoted you and thanks. – einpoklum Jan 03 '22 at 14:37
-3

tl;dr: Your reserved region size is not a multiple of (some device's) allocation granularity.

As @AbatorAbetor suggested, cuMemAddressReserve() implicitly requires the size of the memory region to be a multiple of some granularity value. And despite 0x20000 seeming like a generous enough value for that (2^17 bytes = 128 KiB ... system memory pages are typically 4 KiB = 2^12 bytes) - NVIDIA GPUs are very demanding here.

For example, a Pascal GTX 1050 Ti GPU with ~4GB of memory has a granularity of 0x200000, or 2 MiB - 16 times more than what you were trying to allocate.

Now, what would happen if we had two devices with different granularity values? Would we need to use the least-common-multiple? Who knows.

Anyway, bottom line: Always check the granularity both before allocating and before reserving.

I have filed this as a documentation bug with NVIDIA, bug 3486420 (but you may not be able to follow the link, because NVIDIA hide their bugs from their users).

einpoklum
  • 118,144
  • 57
  • 340
  • 684