0

I am trying to make a CUDA stream instance automatically delete itself once all its usages have been removed, and I was wondering, when calling cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), whether the object is created on the heap or not (I am assuming it is, but I am not sure).

In the end I want to do something like this:


// Custom deleter for shared_ptr<cudaStream_t>: destroys the CUDA stream
// identified by the handle the managed pointer refers to.
// NOTE(review): the deleter itself is reasonable; the defect in this
// approach is in main() below, where the pointer given to shared_ptr is
// the address of a stack variable.
struct CUDAStreamDeleter {
    void operator()(cudaStream_t* p) const
    {
        // Destroy the stream handle; the cudaError_t result is ignored here.
        cudaStreamDestroy(*p);
    }
};

int main() {
    int numberOfStreams = 4;
    vector<shared_ptr<cudaStream_t>> streams(numberOfStreams);

    for (size_t i = 0; i < numberOfStreams; ++i)
    {
        // 'stream' is an automatic variable scoped to this loop iteration.
        cudaStream_t stream;
        cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

        // BUG: &stream becomes dangling as soon as this iteration ends,
        // so every shared_ptr here manages a dangling pointer.
        streams[i] = shared_ptr<cudaStream_t>(&stream, CUDAStreamDeleter());
    }
}

Edit:

As per @wohlstad a better template would be:

// RAII wrapper around a CUDA stream: the stream is created in the ctor
// and destroyed in the dtor, tying its lifetime to the wrapper object
// (e.g. when held by a shared_ptr).
class StreamWrapper {
    public:
        StreamWrapper() {
            gpuErrchk(cudaStreamCreateWithFlags(&this->stream, cudaStreamNonBlocking));
        }

        ~StreamWrapper() { gpuErrchk(cudaStreamDestroy(stream)); }

        // Non-copyable: a copy would own the same stream handle and
        // destroy it twice (double cudaStreamDestroy).
        StreamWrapper(const StreamWrapper&) = delete;
        StreamWrapper& operator=(const StreamWrapper&) = delete;

        // Access the underlying stream handle (for launch/copy APIs).
        cudaStream_t& get() { return stream; }

        // Address of the handle, for APIs that take a cudaStream_t*.
        cudaStream_t* ref() { return &this->stream; }

    private:
        cudaStream_t stream;
};

int main(){
    int numberOfStreams = 10;
    vector<shared_ptr<StreamWrapper>> streamsTemp(numberOfStreams);

    for (size_t i = 0; i < numberOfStreams; ++i)
    {
        streamsTemp[i] = shared_ptr<StreamWrapper>(new StreamWrapper());
    }

    // Stream testing
    for (size_t i = 0; i < numberOfStreams; ++i)
    {
        int * d_i;
        gpuErrchk(cudaMallocAsync(&d_i, sizeof(int), streamsTemp[i]->get()));
        gpuErrchk(cudaMemcpyAsync(d_i, &i, sizeof(int), cudaMemcpyHostToDevice, streamsTemp[i]->get()));

        int out;
        gpuErrchk(cudaMemcpyAsync(&out, d_i, sizeof(int), cudaMemcpyDeviceToHost, streamsTemp[i]->get()));
        gpuErrchk(cudaFreeAsync(d_i, streamsTemp[i]->get()));
        gpuErrchk(cudaStreamSynchronize(streamsTemp[i]->get()));

        cout << "Out: " << to_string(out) << " In: " << to_string(i);
    }
}

Nameless
  • 85
  • 2
  • 10
  • 3
    The pointer you give the `shared_ptr` to manage becomes dangling as soon as the loop body completes each iteration (because it's an address of an automatic variable). – wohlstad Jul 13 '22 at 14:54
  • You can try to wrap the stream in a RAII wrapper (destroying the stream in the dtor). – wohlstad Jul 13 '22 at 14:58
  • 1
    "I was wondering ... if it is created the object on the heap or not?" the location of a variable, stack or heap, is not unique or specific to CUDA. This: `cudaStream_t stream;`. just like this: `int a;` is typically referred to as a stack-based variable, when that definition appears within `main` or function scope. This: `cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);` doesn't determine the location of the variable. It is merely **setting** that variable to some value. – Robert Crovella Jul 13 '22 at 15:04
  • Sorry yeah, this shared_ptr implementation is not the best; however, cudaStream_t is a pointer itself so when cudaStreamCreateWithFlags is called you could do stream = new Stream object inside the function call. – Nameless Jul 13 '22 at 15:13
  • 1
    Yes, its a pointer ["under the hood"](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1ge15d9c8b7a240312b533d6122558085a), but what it points to (if anything at all) is basically opaque. Therefore I refer to it as a "handle". It is used by the CUDA runtime to identify a specific stream to associate with whenever you use that "handle". And the pointer is located on the stack (in the case here). What exactly it points to, if anything at all, is an unknown, and doesn't need to enter into your design considerations. You just need to create/destroy it. – Robert Crovella Jul 13 '22 at 15:39
  • And, no, you cannot do `cudaStream_t s; s = new cudaStream_t;`, so its not clear what you mean by "you could do stream = new Stream object inside the function call" – Robert Crovella Jul 13 '22 at 15:42
  • 1
    Next time it's better if you don't edit the question to include a possible answer. Instead you can write it as an answer. In this case I already wrote this as an answer (just now). – wohlstad Jul 13 '22 at 16:11
  • Regardless of whether `cudaStream_t` is itself a pointer type or not, `cudaStream_t stream; shared_ptr<cudaStream_t>(&stream, ...);` creates a `shared_ptr` holding a dangling `cudaStream_t*` pointer. If `cudaStream_t` is actually a pointer type, then perhaps try something like this: `struct CUDAStreamDeleter { void operator()(cudaStream_t p) const { cudaStreamDestroy(p); } }; ... cudaStream_t stream; ... shared_ptr<remove_pointer_t<cudaStream_t>>(stream, CUDAStreamDeleter{});` – Remy Lebeau Jul 13 '22 at 17:07

1 Answer

3

As mentioned in several comments above (including mine), your first attempt involves creating std::shared_ptrs managing dangling pointers.
This is because these pointers are actually addresses of automatic variables created on the stack in the scope of the loop body (and therefore become dangling once the variables get out of scope).

However - you can use the RAII idiom to achieve what you need:
In the code below, StreamWrapper will create the stream in the ctor, and destroy it in the dtor.

Code:

#include "cuda_runtime.h"

#include <vector>
#include <memory>
#include <iostream>
#include <string>


#define gpuErrchk(X) X  // use your current definition of gpuErrchk


// RAII class: creates the stream in the ctor, destroys it in the dtor.
class StreamWrapper {
public:
    StreamWrapper()  { gpuErrchk(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); }
    ~StreamWrapper() { gpuErrchk(cudaStreamDestroy(stream)); }

    // Non-copyable: a copy would own the same stream handle and
    // destroy it twice (double cudaStreamDestroy).
    StreamWrapper(const StreamWrapper&) = delete;
    StreamWrapper& operator=(const StreamWrapper&) = delete;

    // Access the underlying stream handle.
    cudaStream_t& get() { return stream; }

private:
    cudaStream_t stream;
};


int main() {
    int numberOfStreams = 10;
    std::vector<std::shared_ptr<StreamWrapper>> streamsTemp(numberOfStreams);

    for (size_t i = 0; i < numberOfStreams; ++i)
    {
        streamsTemp[i] = std::make_shared<StreamWrapper>();
    }

    // Stream testing
    for (size_t i = 0; i < numberOfStreams; ++i)
    {
        int* d_i;
        gpuErrchk(cudaMallocAsync(&d_i, sizeof(int), streamsTemp[i]->get()));
        gpuErrchk(cudaMemcpyAsync(d_i, &i, sizeof(int), cudaMemcpyHostToDevice, streamsTemp[i]->get()));

        int out;
        gpuErrchk(cudaMemcpyAsync(&out, d_i, sizeof(int), cudaMemcpyDeviceToHost, streamsTemp[i]->get()));
        gpuErrchk(cudaFreeAsync(d_i, streamsTemp[i]->get()));
        gpuErrchk(cudaStreamSynchronize(streamsTemp[i]->get()));

        std::cout << "Out: " << std::to_string(out) << " In: " << std::to_string(i) << std::endl;
    }
}

Notes:

  1. When initializing a std::shared_ptr it is better to use std::make_shared. See here: Difference in make_shared and normal shared_ptr in C++.
  2. Better to avoid using namespace std - see here: Why is "using namespace std;" considered bad practice?.
wohlstad
  • 12,661
  • 10
  • 26
  • 39