I am trying to implement a cuda graph experiment. There are three kernels, kernel_0, kernel_1, and kernel_2. They will be executed sequentially and have dependencies. Right now I am going to only capture kernel_1. These are my code:
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define N 50000
#define NSTEP 1000
#define NKERNEL 20
using namespace std::chrono;
static const char *_cudaGetErrorEnum(cudaError_t error) {
return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
__global__ void shortKernel_0(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
in_d[idx] = 1.0;
out_d[idx]=1 + in_d[idx];
}
}
__global__ void shortKernel_1(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) out_d[idx]=2*in_d[idx];
}
__global__ void shortKernel_2(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
out_d[idx]=3*in_d[idx];
}
}
void test(){
size_t size_bytes = N * sizeof(float);
void * in_d_0;
void * out_d_0;
void * out_d_1;
void * out_d_2;
int threads = 128;
int blocks = (N+threads)/threads;
int iter = 10;
cudaStream_t stream;
cudaStreamCreate(&stream);
CUmemoryPool pool_;
cuDeviceGetDefaultMemPool(&pool_, 0);
uint64_t threshold = UINT64_MAX;
cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
cudaGraph_t graph;
cudaGraphExec_t instance;
bool graphCreated=false;
for (int i =0; i < iter; i++){
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
if (!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
cudaStreamEndCapture(stream, &graph);
checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphUpload(instance, stream));
graphCreated = true;
}else{
checkCudaErrors(cudaGraphLaunch(instance, stream));
}
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
}
cudaDeviceSynchronize();
printf("With async malloc done!");
cudaStreamDestroy(stream);
cudaGraphDestroy(graph);
cudaGraphExecDestroy(instance);
}
int main() {
test();
return 0;
}
The output from kernel_0 is consumed by kernel_1. and The output from kernel_1 is consumed by kernel_2.
However, when I ran with compute-sanitizer
, I got some errors. Any idea on this error? Part of error is attached:
========= Program hit CUDA_ERROR_INVALID_VALUE (error 1) due to "invalid argument" on CUDA API call to cuMemFreeAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2ef045]
========= in /usr/local/cuda/compat/lib.real/libcuda.so.1
========= Host Frame:test() [0xb221]
========= in /opt/test-cudagraph/./a.out
========= Host Frame:main [0xb4b3]
========= in /opt/test-cudagraph/./a.out
========= Host Frame:__libc_start_main [0x24083]
========= in /usr/lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaf6e]
========= in /opt/test-cudagraph/./a.out