I am testing out cuda graphs. My graph is as follows.
the code for this is as follows
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#define NumThreads 20
#define NumBlocks 1
template <typename PtrType>
__global__ void kernel1(PtrType *buffer, unsigned int numElems) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
buffer[tid] = (PtrType)tid;
}
template <typename PtrType>
__global__ void kernel2(PtrType *buffer, unsigned int numElems) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < numElems/2) buffer[tid] += 5;
}
template <typename PtrType>
__global__ void kernel3(PtrType *buffer, unsigned int numElems) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid>=numElems/2) buffer[tid] *= 5;
}
template <typename PtrType>
void print(void *data) {
PtrType *buffer = (PtrType *)data;
std::cout << "[";
for (unsigned int i = 0; i < NumThreads; ++i) {
std::cout << buffer[i] << ",";
}
std::cout << "]\n";
}
void runCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream) {
cudaGraphInstantiate(&graphExec, Graph, nullptr, nullptr, 0);
cudaStreamCreateWithFlags(&graphStream, cudaStreamNonBlocking);
cudaGraphLaunch(graphExec, graphStream);
cudaStreamSynchronize(graphStream);
}
void destroyCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream) {
cudaCtxResetPersistingL2Cache();
cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(Graph);
cudaStreamDestroy(graphStream);
cudaDeviceReset();
}
template <typename PtrType>
void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream, PtrType *buffer,
unsigned int numElems, PtrType *hostBuffer) {
cudaGraphCreate(&Graph, 0);
cudaGraphNode_t Kernel1;
cudaKernelNodeParams nodeParams = {0};
memset(&nodeParams, 0, sizeof(nodeParams));
nodeParams.func = (void *)kernel1<PtrType>;
nodeParams.gridDim = dim3(NumBlocks, 1, 1);
nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
nodeParams.sharedMemBytes = 0;
void *inputs[2];
inputs[0] = (void *)&buffer;
inputs[1] = (void *)&numElems;
nodeParams.kernelParams = inputs;
nodeParams.extra = nullptr;
cudaGraphAddKernelNode(&Kernel1, Graph, nullptr, 0, &nodeParams);
cudaGraphNode_t Kernel2;
memset(&nodeParams, 0, sizeof(nodeParams));
nodeParams.func = (void *)kernel2<PtrType>;
nodeParams.gridDim = dim3(NumBlocks, 1, 1);
nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
nodeParams.sharedMemBytes = 0;
inputs[0] = (void *)&buffer;
inputs[1] = (void *)&numElems;
nodeParams.kernelParams = inputs;
nodeParams.extra = NULL;
cudaGraphAddKernelNode(&Kernel2, Graph, &Kernel1, 1, &nodeParams);
cudaGraphNode_t Kernel3;
memset(&nodeParams, 0, sizeof(nodeParams));
nodeParams.func = (void *)kernel3<PtrType>;
nodeParams.gridDim = dim3(NumBlocks, 1, 1);
nodeParams.blockDim = dim3(NumThreads/NumBlocks, 1, 1);
nodeParams.sharedMemBytes = 0;
inputs[0] = (void *)&buffer;
inputs[1] = (void *)&numElems;
nodeParams.kernelParams = inputs;
nodeParams.extra = NULL;
cudaGraphAddKernelNode(&Kernel3, Graph, &Kernel1, 1, &nodeParams);
cudaGraphNode_t copyBuffer;
std::vector<cudaGraphNode_t> dependencies = {Kernel2, Kernel3};
cudaGraphAddMemcpyNode1D(©Buffer, Graph,dependencies.data(),dependencies.size(),hostBuffer, buffer, numElems*sizeof(PtrType), cudaMemcpyDeviceToHost);
cudaGraphNode_t Host1;
cudaHostNodeParams hostNodeParams;
memset(&hostNodeParams, 0, sizeof(hostNodeParams));
hostNodeParams.fn = print<PtrType>;
hostNodeParams.userData = (void *)&hostBuffer;
cudaGraphAddHostNode(&Host1, Graph, ©Buffer, 1,
&hostNodeParams);
}
int main() {
cudaGraph_t graph;
cudaGraphExec_t graphExec;
cudaStream_t graphStream;
unsigned int numElems = NumThreads;
unsigned int bufferSizeBytes = numElems * sizeof(unsigned int);
unsigned int hostBuffer[numElems];
memset(hostBuffer, 0, bufferSizeBytes);
unsigned int *deviceBuffer;
cudaMalloc(&deviceBuffer, bufferSizeBytes);
createCudaGraph(graph, graphExec, graphStream, deviceBuffer,numElems, hostBuffer);
runCudaGraph(graph, graphExec, graphStream);
destroyCudaGraph(graph, graphExec, graphStream);
std::cout << "graph example done!" << std::endl;
}
When I run this example I get a result of [3593293488,22096,3561843129,22096,3561385808,22096,3593293488,22096,3598681264,22096,3561792984,22096,2687342880,0,0,0,3598597376,22096,3598599312,0,]
However I expect: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
I can't figure out where I went wrong. I used cuda-gdb and it seems right on the GPU. However, somewhere in the memCpy and sending to host function it goes wrong. Any ideas?