3

A CUDA stream is a queue of tasks: memory copies, event firing, event waits, kernel launches, callbacks...

But - these queues don't have infinite capacity. In fact, empirically, I find that this limit is not super-high, e.g. in the thousands, not millions.

My questions:

  1. Is the size/capacity of a CUDA stream fixed in terms of any kind of enqueued items, or does the capacity behave differently based on what kind of actions/tasks you enqueue?
  2. How can I determine this capacity other than enqueuing more and more stuff until I can no longer fit any?
talonmies
  • 70,661
  • 34
  • 192
  • 269
einpoklum
  • 118,144
  • 57
  • 340
  • 684
  • I think they have different sizes on device and host. Device should be able to hold less kernel bytes than host if graphics card has 2GB memory and host has 4GB memory. If each kernel metadata is a pinned buffer in the queue of stream, then system resources may let you enqueue only thousands of pinned buffers, not whole memory. I'm assuming CUDA's developers optimized kernel metadata by always pinning their buffer and some other optimizations (like aligning to 4096) that may not be as available as a simple malloc call. How many page-aligned buffers can exist in fragmented memory? – huseyin tugrul buyukisik Jun 25 '22 at 07:11
  • @huseyintugrulbuyukisik: I'm interacting with the host-side queue; whatever is on the device is an interesting implementation detail, but not what I'm after - which is when you can no longer enqueue and have to wait for space to become available (and that's the host side). – einpoklum Jun 25 '22 at 07:42
  • From what I gather there seems to be a rather small limit of ~24 tasks per stream after which it will effectively block. I couldn't find any authoritative source for this, though. – Homer512 Jun 25 '22 at 11:46
  • @Homer512: I'm seeing at least a couple of hundred tasks and maybe more. – einpoklum Jun 25 '22 at 14:03

1 Answer

2

Is the size/capacity of a CUDA stream fixed in terms of any kind of enqueued items, or does the capacity behave differently based on what kind of actions/tasks you enqueue?

The "capacity" behaves differently based on actions/tasks you enqueue.

Here is a demonstration:

If we enqueue a single host function/callback in the midst of a number of kernel calls, on a Tesla V100 on CUDA 11.4 I observe a "capacity" for ~1000 enqueued items. However if I alternate kernel calls and host functions, I observe a capacity for ~100 enqueued items.

// test case with alternating kernels and callbacks

$ cat t2042a.cu
#include <condition_variable>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

// Abort with a diagnostic when a CUDA runtime call fails. The original
// macro expanded to the bare call and discarded the cudaError_t, so any
// stream-create/sync/destroy failure went unnoticed.
#define CUDACHECK(x)                                                         \
  do {                                                                       \
    cudaError_t cuda_check_err_ = (x);                                       \
    if (cuda_check_err_ != cudaSuccess) {                                    \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "   \
                << cudaGetErrorString(cuda_check_err_) << std::endl;         \
      std::exit(1);                                                          \
    }                                                                        \
  } while (0)
// Empty kernel: enqueued purely to occupy one slot in the stream's queue;
// it performs no work and touches no memory.
__global__ void NoOpKernel() {}

// One-shot host-side event used to park a stream callback: Wait() blocks
// the calling thread until some other thread invokes Signal().
class Event {
 private:
  std::mutex mu_;                // guards fired_
  std::condition_variable cv_;   // notified once fired_ becomes true
  bool fired_ = false;           // latched: once set it never resets

 public:
  // Mark the event as fired and wake every thread blocked in Wait().
  // Subsequent Wait() calls return immediately.
  void Signal() {
    {
      std::lock_guard<std::mutex> guard(mu_);
      fired_ = true;
    }
    cv_.notify_all();
  }

  // Block until Signal() has been called (predicate form guards against
  // spurious wakeups, equivalent to the manual while-loop).
  void Wait() {
    std::unique_lock<std::mutex> guard(mu_);
    cv_.wait(guard, [this] { return fired_; });
  }
};

// Host callback enqueued on the stream: parks the CUDA runtime's callback
// thread in Event::Wait() until the main thread signals, keeping every
// later item in the stream queued behind it.
void CUDART_CB block_op_host_fn(void* arg) {
  static_cast<Event*>(arg)->Wait();
}

// Demo: alternate an empty kernel launch with a blocking host callback on a
// single stream, counting how many pairs can be enqueued before a launch
// itself blocks (the stream queue is "full").
int main() {
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreate(&stream));

  const int num_events = 60; // 50 is okay, 60 will hang
  std::vector<std::shared_ptr<Event>> event_vec;

  for (int idx = 0; idx < num_events; idx++) {
    std::cout << "Queuing NoOp " << idx << std::endl;
    NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
    std::cout << "Queued NoOp " << idx << std::endl;

    auto evt = std::make_shared<Event>();
    event_vec.push_back(evt);
    cudaLaunchHostFunc(stream, block_op_host_fn, evt.get());

    std::cout << "Queued block_op " << idx << std::endl;
  }


  // Release every callback so the queued work can drain.
  for (auto& evt : event_vec) {
    evt->Signal();
  }

  // clean up
  CUDACHECK(cudaDeviceSynchronize());
  CUDACHECK(cudaStreamDestroy(stream));
  return 0;
}
$ nvcc -o t2042a t2042a.cu
$ ./t2042a
Queuing NoOp 0
Queued NoOp 0
Queued block_op 0
Queuing NoOp 1
Queued NoOp 1
Queued block_op 1
Queuing NoOp 2
Queued NoOp 2
Queued block_op 2
Queuing NoOp 3
Queued NoOp 3
Queued block_op 3
Queuing NoOp 4
Queued NoOp 4
Queued block_op 4
Queuing NoOp 5
Queued NoOp 5
Queued block_op 5
Queuing NoOp 6
Queued NoOp 6
Queued block_op 6
Queuing NoOp 7
Queued NoOp 7
Queued block_op 7
Queuing NoOp 8
Queued NoOp 8
Queued block_op 8
Queuing NoOp 9
Queued NoOp 9
Queued block_op 9
Queuing NoOp 10
Queued NoOp 10
Queued block_op 10
Queuing NoOp 11
Queued NoOp 11
Queued block_op 11
Queuing NoOp 12
Queued NoOp 12
Queued block_op 12
Queuing NoOp 13
Queued NoOp 13
Queued block_op 13
Queuing NoOp 14
Queued NoOp 14
Queued block_op 14
Queuing NoOp 15
Queued NoOp 15
Queued block_op 15
Queuing NoOp 16
Queued NoOp 16
Queued block_op 16
Queuing NoOp 17
Queued NoOp 17
Queued block_op 17
Queuing NoOp 18
Queued NoOp 18
Queued block_op 18
Queuing NoOp 19
Queued NoOp 19
Queued block_op 19
Queuing NoOp 20
Queued NoOp 20
Queued block_op 20
Queuing NoOp 21
Queued NoOp 21
Queued block_op 21
Queuing NoOp 22
Queued NoOp 22
Queued block_op 22
Queuing NoOp 23
Queued NoOp 23
Queued block_op 23
Queuing NoOp 24
Queued NoOp 24
Queued block_op 24
Queuing NoOp 25
Queued NoOp 25
Queued block_op 25
Queuing NoOp 26
Queued NoOp 26
Queued block_op 26
Queuing NoOp 27
Queued NoOp 27
Queued block_op 27
Queuing NoOp 28
Queued NoOp 28
Queued block_op 28
Queuing NoOp 29
Queued NoOp 29
Queued block_op 29
Queuing NoOp 30
Queued NoOp 30
Queued block_op 30
Queuing NoOp 31
Queued NoOp 31
Queued block_op 31
Queuing NoOp 32
Queued NoOp 32
Queued block_op 32
Queuing NoOp 33
Queued NoOp 33
Queued block_op 33
Queuing NoOp 34
Queued NoOp 34
Queued block_op 34
Queuing NoOp 35
Queued NoOp 35
Queued block_op 35
Queuing NoOp 36
Queued NoOp 36
Queued block_op 36
Queuing NoOp 37
Queued NoOp 37
Queued block_op 37
Queuing NoOp 38
Queued NoOp 38
Queued block_op 38
Queuing NoOp 39
Queued NoOp 39
Queued block_op 39
Queuing NoOp 40
Queued NoOp 40
Queued block_op 40
Queuing NoOp 41
Queued NoOp 41
Queued block_op 41
Queuing NoOp 42
Queued NoOp 42
Queued block_op 42
Queuing NoOp 43
Queued NoOp 43
Queued block_op 43
Queuing NoOp 44
Queued NoOp 44
Queued block_op 44
Queuing NoOp 45
Queued NoOp 45
Queued block_op 45
Queuing NoOp 46
Queued NoOp 46
Queued block_op 46
Queuing NoOp 47
Queued NoOp 47
Queued block_op 47
Queuing NoOp 48
Queued NoOp 48
Queued block_op 48
Queuing NoOp 49
Queued NoOp 49
Queued block_op 49
Queuing NoOp 50
Queued NoOp 50
Queued block_op 50
Queuing NoOp 51
Queued NoOp 51
Queued block_op 51
Queuing NoOp 52
Queued NoOp 52
Queued block_op 52
Queuing NoOp 53
Queued NoOp 53
Queued block_op 53
Queuing NoOp 54
Queued NoOp 54
Queued block_op 54
Queuing NoOp 55
Queued NoOp 55
Queued block_op 55
Queuing NoOp 56
Queued NoOp 56
Queued block_op 56
Queuing NoOp 57
^C
$

// test case with a single callback and many kernels

$ cat t2042.cu
#include <iostream>
#include <vector>
#include <mutex>
#include <condition_variable>
#include <cstdlib>
// Abort with a diagnostic when a CUDA runtime call fails. The original
// macro expanded to the bare call and discarded the cudaError_t, so any
// stream-create/sync/destroy failure went unnoticed.
#define CUDACHECK(x)                                                         \
  do {                                                                       \
    cudaError_t cuda_check_err_ = (x);                                       \
    if (cuda_check_err_ != cudaSuccess) {                                    \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "   \
                << cudaGetErrorString(cuda_check_err_) << std::endl;         \
      std::exit(1);                                                          \
    }                                                                        \
  } while (0)
// Empty kernel: enqueued purely to occupy one slot in the stream's queue;
// it performs no work and touches no memory.
__global__ void NoOpKernel() {}

// One-shot host-side event used to park a stream callback: Wait() blocks
// the calling thread until some other thread invokes Signal().
class Event {
 private:
  std::mutex mu_;                // guards fired_
  std::condition_variable cv_;   // notified once fired_ becomes true
  bool fired_ = false;           // latched: once set it never resets

 public:
  // Mark the event as fired and wake every thread blocked in Wait().
  // Subsequent Wait() calls return immediately.
  void Signal() {
    {
      std::lock_guard<std::mutex> guard(mu_);
      fired_ = true;
    }
    cv_.notify_all();
  }

  // Block until Signal() has been called (predicate form guards against
  // spurious wakeups, equivalent to the manual while-loop).
  void Wait() {
    std::unique_lock<std::mutex> guard(mu_);
    cv_.wait(guard, [this] { return fired_; });
  }
};

// Host callback enqueued on the stream: parks the CUDA runtime's callback
// thread in Event::Wait() until the main thread signals, keeping every
// later item in the stream queued behind it.
void CUDART_CB block_op_host_fn(void* arg) {
  static_cast<Event*>(arg)->Wait();
}

// Demo: enqueue ONE blocking host callback followed by many empty kernel
// launches on the same stream, to see how many items fit in the stream's
// queue before a kernel launch itself blocks (~1000 observed above).
// NOTE(review): std::shared_ptr is used below but <memory> is not included
// directly; this compiles via a transitive include — consider adding it.
int main(int argc, char *argv[]) {
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreate(&stream));

  int num_loops = 2000; // default kernel-launch count; override via argv[1]
  int num_events = 0;   // number of blocking callbacks actually enqueued
  std::vector<std::shared_ptr<Event>> event_vec;
  if (argc > 1) num_loops = atoi(argv[1]);

  for (int i = 0; i < num_loops; i++) {
    std::cout << "Queuing NoOp " << i << std::endl;
    NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
    std::cout << "Queued NoOp " << i << std::endl;
    // Only the first iteration enqueues a blocking callback; every later
    // kernel piles up behind it in the stream queue.
    if (i == 0){
      num_events++;
      event_vec.push_back(std::make_shared<Event>());
      cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get());

      std::cout << "Queued block_op " << i << std::endl;}
  }


  // Unblock the callback(s) so the queued work can drain.
  for (int i = 0; i < num_events; i++) {
    event_vec[i]->Signal();
  }

  // clean up
  CUDACHECK(cudaDeviceSynchronize());
  CUDACHECK(cudaStreamDestroy(stream));
  return 0;
}
$ nvcc -o t2042 t2042.cu
$ ./t2042
... <snip>
Queuing NoOp 1019
Queued NoOp 1019
Queuing NoOp 1020
Queued NoOp 1020
Queuing NoOp 1021
Queued NoOp 1021
Queuing NoOp 1022
^C
$

(the code hangs when the queue becomes "full", and I terminate at that point with ctrl-C)

How can I determine this capacity other than enqueuing more and more stuff until I can no longer fit any?

Currently, there is no specification for this in CUDA, nor any explicit method to query for this at runtime.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257