0

My understanding of cudaGraphInstantiateFlagUseNodePriority is to prioritize the kernel calls. i.e. we have three independent kernels in cudaGraph first, second & third, and let each kernel waits for 1s and print its name.

If we set kernel graph node priority using cudaGraphKernelNodeSetAttribute (attr-name - cudaLaunchAttributePriority) for each as 0, 1, 2. When the graph is executed, it should honor priority i.e. third should be called followed by second, and followed by first.

Another thing to note is that after setting priority or kernel graph node If I try to confirm using cudaGraphKernelNodeGet Attribute (attr-name - cudaLaunchAttributePriority), I always get priority as 0. It should return the same value set by the Set call as mentioned previously … Right? Please correct me if I am wrong.

I have tried the below sample to understand behavior.

#include <cuda_runtime.h>
#include <vector>
#include <cstdio>
#include <chrono>

#define CUDA_CHECK(error)                                                                            \
    {                                                                                              \
        cudaError_t localError = error;                                                             \
        if (localError != cudaSuccess) {      \
           printf("error: '%s'(%d) from %s at %s:%d\n",  cudaGetErrorString(localError),   \
                   localError, #error, __FUNCTION__, __LINE__);                              \
                exit(0);\
        }                                                                                          \
    }

__global__ void first(uint32_t interval, const uint32_t ticks_per_ms) {
  size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
  if (globaltid == 0) {
    printf("\nfirst..");
    while (interval--) {
      uint64_t start = clock64();
      while (clock64() - start < ticks_per_ms) {
      }
    }
    printf("first\n");
  }
}
__global__ void second(uint32_t interval, const uint32_t ticks_per_ms) {
  size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
  if (globaltid == 0) {
    printf("\nsecond..");
    while (interval--) {
      uint64_t start = clock64();
      while (clock64() - start < ticks_per_ms) {
      }
    }
    printf("second\n");
  }
}
__global__ void third(uint32_t interval, const uint32_t ticks_per_ms) {
  size_t globaltid = blockIdx.x * blockDim.x + threadIdx.x;
  if (globaltid == 0) {
    printf("\nthird..");
    while (interval--) {
      uint64_t start = clock64();
      while (clock64() - start < ticks_per_ms) {
      }
    }
    printf("third\n");
  }
}

void cudaGraphsManual() {
  cudaStream_t streamForGraph;
  cudaGraph_t graph;
  cudaGraphNode_t kernelNode;
  CUDA_CHECK(cudaStreamCreate(&streamForGraph));
  cudaKernelNodeParams kernelNodeParams = {0};
  CUDA_CHECK(cudaGraphCreate(&graph, 0));

  int ticks_per_ms = 0;
  CUDA_CHECK(cudaDeviceGetAttribute(&ticks_per_ms, cudaDevAttrClockRate, 0));
  uint32_t interval = std::chrono::milliseconds(1000).count();

  void *kernelArgs[2] = {&interval,
                         &ticks_per_ms};

  kernelNodeParams.func = (void *)first;
  kernelNodeParams.gridDim = dim3(1, 1, 1);
  kernelNodeParams.blockDim = dim3(1, 1, 1);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = kernelArgs;
  kernelNodeParams.extra = NULL;

  CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));
  union cudaKernelNodeAttrValue p1; p1.priority = 0;
  CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p1));
  
  union cudaKernelNodeAttrValue p4;
  CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
  printf("\nPriority : %d\n", p4.priority);
  
  kernelNodeParams.func = (void *)second;
  CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph,  NULL, 0, &kernelNodeParams));
  union cudaKernelNodeAttrValue p2; p2.priority = 2;
  CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p2));

  CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
  printf("\nPriority : %d\n", p4.priority);

  kernelNodeParams.func = (void *)third;
  CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph,  NULL, 0, &kernelNodeParams));
  union cudaKernelNodeAttrValue p3; p3.priority = 1;
  CUDA_CHECK(cudaGraphKernelNodeSetAttribute(kernelNode, cudaLaunchAttributePriority, &p3));
  
  CUDA_CHECK(cudaGraphKernelNodeGetAttribute(kernelNode, cudaLaunchAttributePriority, &p4));
  printf("\nPriority : %d\n", p4.priority);
  
  cudaGraphExec_t graphExec;
  CUDA_CHECK(cudaGraphInstantiateWithFlags(&graphExec, graph, cudaGraphInstantiateFlagUseNodePriority));
  CUDA_CHECK(cudaGraphLaunch(graphExec, streamForGraph));
  CUDA_CHECK(cudaStreamSynchronize(streamForGraph));
  CUDA_CHECK(cudaGraphExecDestroy(graphExec));
  CUDA_CHECK(cudaGraphDestroy(graph));
  CUDA_CHECK(cudaStreamDestroy(streamForGraph));
}

int main(int argc, char **argv) {
  cudaGraphsManual();
  return EXIT_SUCCESS;
}
Veeresh Devireddy
  • 1,057
  • 12
  • 24

1 Answers1

1

My understanding of cudaGraphInstantiateFlagUseNodePriority is to prioritize kernel calls.

It should probably be thought of as an analog of CUDA stream priorities.

for each as 0, 1, 2.

That is evidently not what you want to choose for priorities (see below). Use the stream priority mechanism to find an appropriate range of priorities, rather than choosing your own numbers/range arbitrarily.

When graph is executed then it should honor priority i.e. third should be called followed by second and followed by first.

That is not how stream priority works. Kernels may still begin execution in the order in which they were launched. However stream priority suggests that the CUDA block scheduler will preferentially choose blocks from higher priority streams over lower priority streams, when choosing blocks to deposit on available SMs. This is more-or-less meaningless in your case because:

  1. Each kernel launch consists of only 1 block.
  2. All of your kernels can run concurrently anyway. The block scheduler is free to deposit the block of each kernel as soon as it is available.

I always get priority as 0. It should return same value set by Set call as mentioned previously … Right?

Not if you request an invalid stream priority level. 0 is evidently valid. The others evidently are not.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Sample is just to understand use of cudaGraphInstantiateFlagUseNodePriority. as per CUDA doc for cudaGraphInstantiateFlagUseNodePriority Run the graph using the per-node priority attributes rather than the priority of the stream it is launched into. Does it not meant to have priority for kernel graph node on top of stream priority on which graph is launched? Is there CUDA sample/article on cudaGraphInstantiateFlagUseNodePriority to understand it more? I could not gather much on cudaGraphInstantiateFlagUseNodePriority. Thank you! – user9684153 Aug 23 '23 at 08:57
  • I've responded to some of this in your cross posting [here](https://forums.developer.nvidia.com/t/behavior-of-cudagraphinstantiateflagusenodepriority/263842/8). – Robert Crovella Aug 23 '23 at 13:20