
I am trying to implement a CUDA graph experiment. There are three kernels, kernel_0, kernel_1, and kernel_2. They are executed sequentially and have dependencies. For now I only want to capture kernel_1. Here is my code:


#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

#define N 50000
#define NSTEP 1000
#define NKERNEL 20

using namespace std::chrono;

static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}

template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

__global__ void shortKernel_0(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        in_d[idx] = 1.0;
        out_d[idx]=1 + in_d[idx];
    }
}

__global__ void shortKernel_1(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) out_d[idx]=2*in_d[idx];
}

__global__ void shortKernel_2(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        out_d[idx]=3*in_d[idx];
    }
}

void test(){

    size_t size_bytes = N * sizeof(float);
    void * in_d_0;
    void * out_d_0;
    void * out_d_1;
    void * out_d_2;

    int threads = 128;
    int blocks = (N+threads)/threads;
    int iter = 10;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    CUmemoryPool pool_;
    cuDeviceGetDefaultMemPool(&pool_, 0);
    uint64_t threshold = UINT64_MAX;
    cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    cudaGraph_t graph;
    cudaGraphExec_t instance;
    bool graphCreated=false;

    for (int i =0; i < iter; i++){
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
        shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
        if (!graphCreated){
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
            cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
            shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
            cudaStreamEndCapture(stream, &graph);
            checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            checkCudaErrors(cudaGraphUpload(instance, stream));
            graphCreated = true;
        }else{
            checkCudaErrors(cudaGraphLaunch(instance, stream));
        }
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
        shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
       
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
    }
   
    cudaDeviceSynchronize();        
    printf("With async malloc done!");
    cudaStreamDestroy(stream);
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(instance);
}

int main() {
    test();
    return 0;
}

The output from kernel_0 is consumed by kernel_1, and the output from kernel_1 is consumed by kernel_2. However, when I ran the program under compute-sanitizer, I got some errors. Any idea what causes this error? Part of the error output is attached:

========= Program hit CUDA_ERROR_INVALID_VALUE (error 1) due to "invalid argument" on CUDA API call to cuMemFreeAsync.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x2ef045]
=========                in /usr/local/cuda/compat/lib.real/libcuda.so.1
=========     Host Frame:test() [0xb221]
=========                in /opt/test-cudagraph/./a.out
=========     Host Frame:main [0xb4b3]
=========                in /opt/test-cudagraph/./a.out
=========     Host Frame:__libc_start_main [0x24083]
=========                in /usr/lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaf6e]
=========                in /opt/test-cudagraph/./a.out
  • You have not compiled the code with binary support for whatever GPU you are using – talonmies Jul 23 '22 at 04:45
  • I don't get you. I compiled the code via ```nvcc test.cu -lcuda -gencode arch=compute_80,code=sm_80``` and executed the binary. When I ran it with ```compute-sanitizer ./a.out``` I got some errors. – kingwales Jul 23 '22 at 05:16
  • Yes and the error is that you have not compiled the code with binary support for your GPU, i.e. you are not running the code on an sm_80 GPU – talonmies Jul 23 '22 at 05:42
  • Thanks for pointing that out. I reran the test on an A100. The binary can be executed, but if I run it with ```compute-sanitizer``` there are bunches of errors. Could you please help me figure out whether I am using those APIs correctly? I have updated the question. – kingwales Jul 23 '22 at 06:10
  • Does this answer your question? [CUDA Graph Problem: Results not computed for the first iteration](https://stackoverflow.com/questions/71618350/cuda-graph-problem-results-not-computed-for-the-first-iteration) – einpoklum Jul 23 '22 at 16:37
  • Thanks all. I got you. As mentioned by @einpoklum, doing the allocation/deallocation outside the loop might help this specific problem. But I want to integrate CUDA graphs into our internal framework. The whole neural network might be very large, and we might insert the capture operations at arbitrary locations. What is the best way to call these APIs? Should I execute this sub-network first (including allocation and free), then do the capture? – kingwales Jul 23 '22 at 17:58
  • I am expecting an answer for this, since this follow-up question is not related to the post. I am just asking for potential suggestions. – kingwales Jul 23 '22 at 17:59
  • @kingwales: Did you mean to write "I am not"? At any rate, if you ask a follow-up question, link to it from a comment on my answer so I can see it; or at-mention my username or Robert's, or both. – einpoklum Jul 23 '22 at 18:09

1 Answer


1. Figuring out where the error occurs, exactly

To get the "idea", you need to wrap all of your API calls with error checks. Doing so properly is a bit tricky, since the cudaError_t runtime-API status type and the CUresult driver-API status type don't agree on all values, so you would need to overload the error-check function:

// Overload for runtime-API status codes (cudaError_t)
void check(cudaError_t result, char const *const func, 
   const char *const file, int const line) 
{
  if (result) {
    fprintf(stderr, "CUDA runtime error at %s:%d code=%d(%s) \"%s\" \n", 
    file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}

// Overload for driver-API status codes (CUresult)
void check(CUresult result, char const *const func, 
   const char *const file, int const line) 
{
  if (result) {
    const char* error_name = "(UNKNOWN)";
    cuGetErrorName(result, &error_name);
    fprintf(stderr, "CUDA driver error at %s:%d code=%d(%s) \"%s\" \n", 
    file, line, static_cast<unsigned int>(result), error_name, func);
    exit(EXIT_FAILURE);
  }
}
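
As a usage sketch (illustrative, not part of the posted code): with both overloads in place, the same checkCudaErrors macro wraps runtime-API and driver-API calls alike, and overload resolution picks the matching check():

checkCudaErrors(cudaStreamCreate(&stream));  // cudaError_t -> runtime-API overload
checkCudaErrors(cuMemAllocFromPoolAsync(
    reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream));  // CUresult -> driver-API overload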

When you then wrap all your calls in an error check, running the program gets you:

CUDA driver error at a.cu:102 code=1(CUDA_ERROR_INVALID_VALUE) "cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream)" 

and the line triggering the error is:

checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream));

i.e. the CUDA driver believes out_d_1 is not a valid device pointer for (asynchronous) freeing.

This was the easy part, and it is not even specific to your program.

2. The errors

There are two problems in your code:

  1. On the first pass of your for loop, you capture the graph using stream capture. When a graph is captured this way, no actual work is done during the capture process. This means that on the first iteration of the for loop, the line cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream); does nothing: no allocation is performed and out_d_1 is not modified. Later in that same iteration you attempt to free that pointer with cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);, but on that iteration it was never allocated, so the free fails. This is the cuMemFreeAsync error reported by compute-sanitizer.

  2. There is also a problem with the use of cuMemFreeAsync during the capture itself, specifically this line: cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream); The item being freed (in_d_0) was allocated outside the graph, yet you attempt to free it during graph capture (i.e. during graph execution). That is not allowed; see the documentation for cuMemFreeAsync (a short sketch follows the quote below):

During stream capture, this function results in the creation of a free node and must therefore be passed the address of a graph allocation
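
As a minimal sketch of that rule (illustrative only, reusing the stream, pool_ and size_bytes from your code): inside a capture region, cuMemFreeAsync may only be given an address that the graph itself allocates, not one that was allocated before the capture began:

CUdeviceptr tmp;
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cuMemAllocFromPoolAsync(&tmp, size_bytes, pool_, stream);  // graph allocation (alloc node)
// ... captured kernels that use tmp ...
cuMemFreeAsync(tmp, stream);                               // OK: frees a graph allocation (free node)
// cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);  // NOT OK: in_d_0 was allocated outside the graph
cudaStreamEndCapture(stream, &graph);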

3. What can you do about it?

Combining those two items, one possible way to fix your posted code is as follows:

$ cat t2068.cu
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

#define N 50000
#define NSTEP 1000
#define NKERNEL 20

using namespace std::chrono;

static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}

template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

__global__ void shortKernel_0(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        in_d[idx] = 1.0;
        out_d[idx]=1 + in_d[idx];
    }
}

__global__ void shortKernel_1(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) out_d[idx]=2*in_d[idx];
}

__global__ void shortKernel_2(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        out_d[idx]=3*in_d[idx];
    }
}

void test(){

    size_t size_bytes = N * sizeof(float);
    void * in_d_0;
    void * out_d_0;
    void * out_d_1;
    void * out_d_2;

    int threads = 128;
    int blocks = (N+threads)/threads;
    int iter = 10;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    CUmemoryPool pool_;
    cuDeviceGetDefaultMemPool(&pool_, 0);
    uint64_t threshold = UINT64_MAX;
    cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    cudaGraph_t graph;
    cudaGraphExec_t instance;
    bool graphCreated=false;

    for (int i =0; i < iter; i++){
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
        shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
        // moved the next line outside of the graph region
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
        if (!graphCreated){
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
            //cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
            shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
            cudaStreamEndCapture(stream, &graph);
            checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            checkCudaErrors(cudaGraphUpload(instance, stream));
            graphCreated = true;
        }
        // modified so that we run the instantiated graph on every iteration
        checkCudaErrors(cudaGraphLaunch(instance, stream));
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
        shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));

        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
    }

    cudaDeviceSynchronize();
    printf("With async malloc done!\n");
    cudaStreamDestroy(stream);
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(instance);
}

int main() {
    test();
    return 0;
}
$ nvcc -o t2068 t2068.cu -lcuda
$ compute-sanitizer ./t2068
========= COMPUTE-SANITIZER
With async malloc done!
========= ERROR SUMMARY: 0 errors
$

A reasonable question might be: "If freeing a non-graph allocation is not allowed in a graph, why didn't graph capture fail?" I suspect the answer is that the graph capture mechanism cannot determine, at capture time, whether your CUdeviceptr will hold an entity that was allocated during graph execution or not.

You might also want to consider avoiding the de-allocation and re-allocation of the other buffers altogether; after all, the buffer sizes are constant across all iterations.
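
For instance (a sketch only, showing the idea rather than a drop-in replacement), the buffers that are not graph allocations could be allocated once before the loop and freed once after it:

// Sketch: hoist the allocations whose sizes never change out of the loop.
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0),  size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
for (int i = 0; i < iter; i++){
    shortKernel_0<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
    // capture on the first iteration / launch the instantiated graph for shortKernel_1,
    // exactly as in the fixed code above (the graph still allocates out_d_1)
    shortKernel_2<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
    cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);  // out_d_1 is a graph allocation; free it every iteration
}
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0),  stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);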

Some observations about this stream ordered memory allocation in graphs:

  • an item allocated outside the graph cannot be freed in the graph
  • an item allocated in the graph can be freed in the graph
  • an item allocated in the graph need not be freed immediately at the end of graph execution, it can be freed later (in non-graph code, as is demonstrated here)
  • an item allocated in a graph should be freed before the graph attempts to allocate it again, and specifically before the graph is launched again. Hopefully the reason is obvious: otherwise you have a typical memory leak. You may also get a graph runtime error if you forget this. You can use a control at graph instantiation to auto-free such allocations at the graph launch point (a sketch follows the quoted documentation below):

If any allocations created by [the graph being launched] remain unfreed ... and hGraphExec was not instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, the launch will fail with CUDA_ERROR_INVALID_VALUE.
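
A possible sketch of that control (assuming you are fine with the runtime API; cudaGraphInstantiateWithFlags and cudaGraphInstantiateFlagAutoFreeOnLaunch require CUDA 11.4 or newer) would replace the plain cudaGraphInstantiate call:

// Sketch: instantiate the captured graph so that any of its still-unfreed graph
// allocations are freed automatically the next time the graph is launched.
checkCudaErrors(cudaGraphInstantiateWithFlags(&instance, graph,
                                              cudaGraphInstantiateFlagAutoFreeOnLaunch));
checkCudaErrors(cudaGraphUpload(instance, stream));
checkCudaErrors(cudaGraphLaunch(instance, stream));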
