simple cuda graph example doesn't product expected result

Question

I am testing out cuda graphs. My graph is as follows.

the code for this is as follows

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

#define NumThreads 20
#define NumBlocks 1



template <typename PtrType>
__global__ void kernel1(PtrType *buffer, unsigned int numElems) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  buffer[tid] = (PtrType)tid;
}

template <typename PtrType>
__global__ void kernel2(PtrType *buffer, unsigned int numElems) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if(tid < numElems/2) buffer[tid] += 5;
}

template <typename PtrType>
__global__ void kernel3(PtrType *buffer, unsigned int numElems) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if(tid>=numElems/2) buffer[tid] *= 5;
}

template <typename PtrType> 
void print(void *data) {
    PtrType *buffer = (PtrType *)data;
    std::cout << "[";
    for (unsigned int i = 0; i < NumThreads; ++i) {
        std::cout << buffer[i] << ",";
    }
    std::cout << "]\n";
  }

void runCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
                  cudaStream_t &graphStream) {
  cudaGraphInstantiate(&graphExec, Graph, nullptr, nullptr, 0);
  cudaStreamCreateWithFlags(&graphStream, cudaStreamNonBlocking);
  cudaGraphLaunch(graphExec, graphStream);
  cudaStreamSynchronize(graphStream);
}

void destroyCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
                      cudaStream_t &graphStream) {
  cudaCtxResetPersistingL2Cache();

  cudaGraphExecDestroy(graphExec);
  cudaGraphDestroy(Graph);
  cudaStreamDestroy(graphStream);
  cudaDeviceReset();
}

template <typename PtrType>
void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
                     cudaStream_t &graphStream, PtrType *buffer,
                     unsigned int numElems, PtrType *hostBuffer) {
  cudaGraphCreate(&Graph, 0);

  cudaGraphNode_t Kernel1;
  cudaKernelNodeParams nodeParams = {0};
  memset(&nodeParams, 0, sizeof(nodeParams));
  nodeParams.func = (void *)kernel1<PtrType>;
  nodeParams.gridDim = dim3(NumBlocks, 1, 1);
  nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
  nodeParams.sharedMemBytes = 0;
  void *inputs[2];
  inputs[0] = (void *)&buffer;
  inputs[1] = (void *)&numElems;
  nodeParams.kernelParams = inputs;
  nodeParams.extra = nullptr;

  
  cudaGraphAddKernelNode(&Kernel1, Graph, nullptr, 0, &nodeParams);

  cudaGraphNode_t Kernel2;
  memset(&nodeParams, 0, sizeof(nodeParams));
  nodeParams.func = (void *)kernel2<PtrType>;
  nodeParams.gridDim = dim3(NumBlocks, 1, 1);
  nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
  nodeParams.sharedMemBytes = 0;
  inputs[0] = (void *)&buffer;
  inputs[1] = (void *)&numElems;
  nodeParams.kernelParams = inputs;
  nodeParams.extra = NULL;

  cudaGraphAddKernelNode(&Kernel2, Graph, &Kernel1, 1, &nodeParams);

  cudaGraphNode_t Kernel3;
  memset(&nodeParams, 0, sizeof(nodeParams));
  nodeParams.func = (void *)kernel3<PtrType>;
  nodeParams.gridDim = dim3(NumBlocks, 1, 1);
  nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
  nodeParams.sharedMemBytes = 0;
  inputs[0] = (void *)&buffer;
  inputs[1] = (void *)&numElems;
  nodeParams.kernelParams = inputs;
  nodeParams.extra = NULL;

  cudaGraphAddKernelNode(&Kernel3, Graph, &Kernel1, 1, &nodeParams);


  cudaGraphNode_t copyBuffer;
  std::vector<cudaGraphNode_t> dependencies = {Kernel2, Kernel3};
  cudaGraphAddMemcpyNode1D(&copyBuffer, Graph,dependencies.data(),dependencies.size(),hostBuffer, buffer, numElems*sizeof(PtrType), cudaMemcpyDeviceToHost);

  cudaGraphNode_t Host1;
  cudaHostNodeParams hostNodeParams;
  memset(&hostNodeParams, 0, sizeof(hostNodeParams));
  hostNodeParams.fn = print<PtrType>;
  hostNodeParams.userData = (void *)&hostBuffer;
  cudaGraphAddHostNode(&Host1, Graph, &copyBuffer, 1,
                       &hostNodeParams);
}

int main() {
  cudaGraph_t graph;
  cudaGraphExec_t graphExec;
  cudaStream_t graphStream;

  unsigned int numElems = NumThreads;
  unsigned int bufferSizeBytes = numElems * sizeof(unsigned int);
  unsigned int hostBuffer[numElems];
  memset(hostBuffer, 0, bufferSizeBytes);
  unsigned int *deviceBuffer;
  cudaMalloc(&deviceBuffer, bufferSizeBytes);
  createCudaGraph(graph, graphExec, graphStream, deviceBuffer,numElems, hostBuffer);
  runCudaGraph(graph, graphExec, graphStream);
  destroyCudaGraph(graph, graphExec, graphStream);
  std::cout << "graph example done!" << std::endl;
}

When I run this example I get a result of [3593293488,22096,3561843129,22096,3561385808,22096,3593293488,22096,3598681264,22096,3561792984,22096,2687342880,0,0,0,3598597376,22096,3598599312,0,]

However I expect: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]

I can't figure out where I went wrong. I used cuda-gdb and it seems right on the GPU. However, somewhere in the memCpy and sending to host function it goes wrong. Any ideas?

score 3 · Answer 1 · answered Nov 04 '22 at 19:27

You are not passing the correct pointer to the host callback.

void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
                     cudaStream_t &graphStream, PtrType *buffer,
                     unsigned int numElems, PtrType *hostBuffer) {
   ...
   hostNodeParams.userData = (void *)&hostBuffer;
}

This takes the address of a stack variable which is no longer valid when the host function is called. Since hostBuffer already points to the array you want to print, you can just pass this pointer directly.

hostNodeParams.userData = (void *)hostBuffer;

That aside, I would like to mention that there is a handy function cudaGraphDebugDotPrint which can output a graph to file that can be converted to png to help with debugging. With your original code, it will show that the pointer used as memcpy destination and the pointer passed to the host function are different.

simple cuda graph example doesn't product expected result

1 Answers1