I am trying to make a CUDA stream instance automatically delete itself once all of its usages have been removed, and I was wondering: when calling cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), is the stream object created on the heap or not? (I am assuming it is, but I am not sure.)
In the end I want to do something like this:
#include <cuda_runtime.h>
#include <memory>
#include <vector>

using namespace std;

struct CUDAStreamDeleter {
    void operator()(cudaStream_t* p) const
    {
        cudaStreamDestroy(*p);
    }
};

int main() {
    int numberOfStreams = 4;
    vector<shared_ptr<cudaStream_t>> streams(numberOfStreams);
    for (size_t i = 0; i < numberOfStreams; ++i)
    {
        cudaStream_t stream;
        cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        // Problem: stream is loop-local, so the shared_ptr ends up holding
        // a pointer that dangles once this iteration ends.
        streams[i] = shared_ptr<cudaStream_t>(&stream, CUDAStreamDeleter());
    }
}
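(I realize the snippet above stores the address of a loop-local stream variable, so the shared_ptr would be left holding a dangling pointer. Below is a minimal sketch of how the shared_ptr-only idea could be kept safe by heap-allocating the handle itself; HeapStreamDeleter and makeStream are just illustrative names, not anything from the CUDA API.)

// Sketch only: heap-allocate the cudaStream_t handle so it outlives the loop
// iteration; the deleter destroys the stream and then frees the handle.
struct HeapStreamDeleter {
    void operator()(cudaStream_t* p) const
    {
        if (p) {
            cudaStreamDestroy(*p);
            delete p;
        }
    }
};

shared_ptr<cudaStream_t> makeStream()
{
    cudaStream_t* p = new cudaStream_t{};
    cudaStreamCreateWithFlags(p, cudaStreamNonBlocking);
    return shared_ptr<cudaStream_t>(p, HeapStreamDeleter());
}

// usage inside the loop: streams[i] = makeStream();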
Edit:
As per @wohlstad, a better approach is a small RAII wrapper class:
#include <cuda_runtime.h>
#include <iostream>
#include <memory>
#include <vector>

using namespace std;

// gpuErrchk is the usual CUDA error-checking macro, defined elsewhere in my code.
class StreamWrapper {
public:
    StreamWrapper() {
        gpuErrchk(cudaStreamCreateWithFlags(&this->stream, cudaStreamNonBlocking));
    }
    ~StreamWrapper() { gpuErrchk(cudaStreamDestroy(stream)); }

    cudaStream_t& get() { return stream; }
    cudaStream_t* ref() { return &this->stream; }

private:
    cudaStream_t stream;
};
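Since copying a StreamWrapper would leave two objects owning the same handle (and cudaStreamDestroy would then run twice), it is probably also worth deleting the copy operations. A sketch of what could be added inside the class:

// Sketch: forbid copies so two wrappers can never own (and destroy) the same stream.
StreamWrapper(const StreamWrapper&) = delete;
StreamWrapper& operator=(const StreamWrapper&) = delete;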
int main() {
    int numberOfStreams = 10;
    vector<shared_ptr<StreamWrapper>> streamsTemp(numberOfStreams);
    for (int i = 0; i < numberOfStreams; ++i)
    {
        streamsTemp[i] = make_shared<StreamWrapper>();
    }

    // Stream testing: round-trip one int through the device on each stream.
    for (int i = 0; i < numberOfStreams; ++i)
    {
        int* d_i = nullptr;
        gpuErrchk(cudaMallocAsync(&d_i, sizeof(int), streamsTemp[i]->get()));
        gpuErrchk(cudaMemcpyAsync(d_i, &i, sizeof(int), cudaMemcpyHostToDevice, streamsTemp[i]->get()));

        int out;
        gpuErrchk(cudaMemcpyAsync(&out, d_i, sizeof(int), cudaMemcpyDeviceToHost, streamsTemp[i]->get()));
        gpuErrchk(cudaFreeAsync(d_i, streamsTemp[i]->get()));
        gpuErrchk(cudaStreamSynchronize(streamsTemp[i]->get()));

        cout << "Out: " << out << " In: " << i << endl;
    }
}
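One caveat: as far as I understand, cudaMemcpyAsync with pageable host memory (plain stack variables like i and out) cannot really overlap with work in other streams, so to actually get concurrency the host side needs pinned buffers. Below is a sketch of the testing-loop body rewritten with cudaMallocHost; h_in and h_out are just illustrative names, and gpuErrchk and streamsTemp are the same as above.

// Sketch: one iteration of the testing loop using pinned (page-locked) host
// buffers so the async copies can genuinely run asynchronously.
cudaStream_t s = streamsTemp[i]->get();

int* h_in = nullptr;
int* h_out = nullptr;
gpuErrchk(cudaMallocHost(&h_in, sizeof(int)));   // pinned host memory
gpuErrchk(cudaMallocHost(&h_out, sizeof(int)));
*h_in = i;

int* d_i = nullptr;
gpuErrchk(cudaMallocAsync(&d_i, sizeof(int), s));
gpuErrchk(cudaMemcpyAsync(d_i, h_in, sizeof(int), cudaMemcpyHostToDevice, s));
gpuErrchk(cudaMemcpyAsync(h_out, d_i, sizeof(int), cudaMemcpyDeviceToHost, s));
gpuErrchk(cudaFreeAsync(d_i, s));
gpuErrchk(cudaStreamSynchronize(s));

cout << "Out: " << *h_out << " In: " << i << endl;

gpuErrchk(cudaFreeHost(h_in));
gpuErrchk(cudaFreeHost(h_out));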