My understanding of cudaGraphInstantiateFlagUseNodePriority is to prioritize the kernel calls. i.e. we have three independent kernels in cudaGraph first, second & third, and let each kernel waits for 1s and print its name.
If we set kernel graph node priority using cudaGraphKernelNodeSetAttribute (attr-name - cudaLaunchAttributePriority) for each as 0, 1, 2. When the graph is executed, it should honor priority i.e. third should be called followed by second, and followed by first.
Another thing to note is that after setting priority or kernel graph node If I try to confirm using cudaGraphKernelNodeGet Attribute (attr-name - cudaLaunchAttributePriority), I always get priority as 0. It should return the same value set by the Set call as mentioned previously … Right? Please correct me if I am wrong.
I have tried the below sample to understand behavior.
#include <cuda_runtime.h>
#include <vector>
#include <cstdio>
#include <chrono>
#define CUDA_CHECK(error) \
{ \
cudaError_t localError = error; \
if (localError != cudaSuccess) { \
printf("error: '%s'(%d) from %s at %s:%d\n", cudaGetErrorString(localError), \
localError, #error, __FUNCTION__, __LINE__); \
exit(0);\
} \
}
__global__ void first(uint32_t interval, const uint32_t ticks_per_ms) {
size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
if (globaltid == 0) {
printf("\nfirst..");
while (interval--) {
uint64_t start = clock64();
while (clock64() - start < ticks_per_ms) {
}
}
printf("first\n");
}
}
__global__ void second(uint32_t interval, const uint32_t ticks_per_ms) {
size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
if (globaltid == 0) {
printf("\nsecond..");
while (interval--) {
uint64_t start = clock64();
while (clock64() - start < ticks_per_ms) {
}
}
printf("second\n");
}
}
__global__ void third(uint32_t interval, const uint32_t ticks_per_ms) {
size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
if (globaltid == 0) {
printf("\nthird..");
while (interval--) {
uint64_t start = clock64();
while (clock64() - start < ticks_per_ms) {
}
}
printf("third\n");
}
}
void cudaGraphsManual() {
cudaStream_t streamForGraph;
cudaGraph_t graph;
cudaGraphNode_t kernelNode;
CUDA_CHECK(cudaStreamCreate(&streamForGraph));
cudaKernelNodeParams kernelNodeParams = {0};
CUDA_CHECK(cudaGraphCreate(&graph, 0));
int ticks_per_ms = 0;
CUDA_CHECK(cudaDeviceGetAttribute(&ticks_per_ms, cudaDevAttrClockRate, 0));
uint32_t interval = std::chrono::milliseconds(1000).count();
void *kernelArgs[2] = {&interval,
&ticks_per_ms};
kernelNodeParams.func = (void *)first;
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(1, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = kernelArgs;
kernelNodeParams.extra = NULL;
CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));
union cudaKernelNodeAttrValue p1; p1.priority = 0;
CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p1));
union cudaKernelNodeAttrValue p4;
CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
printf("\nPriority : %d\n", p4.priority);
kernelNodeParams.func = (void *)second;
CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));
union cudaKernelNodeAttrValue p2; p2.priority = 2;
CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p2));
CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
printf("\nPriority : %d\n", p4.priority);
kernelNodeParams.func = (void *)third;
CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));
union cudaKernelNodeAttrValue p3; p3.priority = 1;
CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p3));
CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
printf("\nPriority : %d\n", p4.priority);
cudaGraphExec_t graphExec;
CUDA_CHECK(cudaGraphInstantiateWithFlags(&graphExec, graph, cudaGraphInstantiateFlagUseNodePriority));
CUDA_CHECK(cudaGraphLaunch(graphExec, streamForGraph));
CUDA_CHECK(cudaStreamSynchronize(streamForGraph));
CUDA_CHECK(cudaGraphExecDestroy(graphExec));
CUDA_CHECK(cudaGraphDestroy(graph));
CUDA_CHECK(cudaStreamDestroy(streamForGraph));
}
int main(int argc, char **argv) {
cudaGraphsManual();
return EXIT_SUCCESS;
}