
Setup: Windows 10, VS 2013, CUDA 7.5, GeForce 920M.
There are no errors or warnings in either case, and the output is the same for both. The only difference is:

[screenshot: Nsight timeline]

In the second case, Stream2 and Stream3 do not exist.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "../../common/common.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>

// Child kernel: prints its tag 100 times.
__global__ void kernel1(const char *value){
    for (int i = 0; i < 100; i++){
        printf("%s\n", value);
    }
}

// Parent kernel: creates two non-blocking device-side streams and launches
// kernel1 into each of them (dynamic parallelism).
__global__ void kernel2(){
    cudaStream_t s1, s2;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);

    kernel1<<<1, 1, 0, s1>>>("up stream");
    kernel1<<<1, 1, 0, s2>>>("bottom stream");
}

int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
    // FIRST CASE: create the streams on the host and launch kernel1 into each
    //cudaStream_t s1, s2;
    //cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    //cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    //kernel1<<<1, 1, 0, s1>>>("up stream");
    //kernel1<<<1, 1, 0, s2>>>("bottom stream");

    // SECOND CASE: create the streams and launch kernel1 from inside kernel2
    kernel2<<<1, 1>>>();

    CHECK(cudaDeviceSynchronize());
    CHECK(cudaGetLastError());  // check kernel error
    CHECK(cudaDeviceReset());   // reset device

    printf("\nEnd\n");
    getchar();
    return (0);
}

I also added common.h:

#ifndef _COMMON_H
#define _COMMON_H

#include <time.h>
#include <stdio.h>
#include <cuda_runtime.h>   // for cudaError_t / cudaGetErrorString used by CHECK

#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        getchar();exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CUBLAS(call)                                                     \
{                                                                              \
    cublasStatus_t err;                                                        \
    if ((err = (call)) != CUBLAS_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__,       \
                __LINE__);                                                     \
        getchar();exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CURAND(call)                                                     \
{                                                                              \
    curandStatus_t err;                                                        \
    if ((err = (call)) != CURAND_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__,       \
                __LINE__);                                                     \
        getchar();exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult err;                                                           \
    if ( (err = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__,        \
                __LINE__);                                                     \
        getchar();exit(1);                                                               \
    }                                                                          \
}

#define CHECK_CUSPARSE(call)                                                   \
{                                                                              \
    cusparseStatus_t err;                                                      \
    if ((err = (call)) != CUSPARSE_STATUS_SUCCESS)                             \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__);   \
        cudaError_t cuda_err = cudaGetLastError();                             \
        if (cuda_err != cudaSuccess)                                           \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(cuda_err));                             \
        }                                                                      \
        getchar();exit(1);                                                               \
    }                                                                          \
}

// Note: returns raw clock() ticks, not seconds
// (divide by CLOCKS_PER_SEC to convert).
clock_t seconds()
{
    return clock();
}
#endif // _COMMON_H
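
One check that is not in the code above, and that would separate a real launch failure from a profiler display issue: the CDP device runtime provides cudaGetLastError(), so the parent kernel can report whether its child launches were actually issued. A minimal sketch (the name kernel2_checked is mine; it reuses kernel1 from the listing above and needs the same -rdc=true / sm_35+ build settings as the original):

// Variant of kernel2 that checks the status of its child launches.
__global__ void kernel2_checked(void)
{
    cudaStream_t s1, s2;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);

    kernel1<<<1, 1, 0, s1>>>("up stream");
    kernel1<<<1, 1, 0, s2>>>("bottom stream");

    // Any error from the device-side launches above is picked up here
    // (prints the numeric cudaError_t value; 0 means cudaSuccess).
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("child launch failed with error %d\n", (int)err);
}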
  • What does "i can't start multiple stream parallel in device" mean - [what happens if you try](http://stackoverflow.com/help/mcve)? – tera Sep 08 '16 at 10:17
  • I edited my question. Can you look at it again? Thanks. – Celil Özkurt Sep 09 '16 at 12:26
  • You have reworded the question indeed. But it still doesn't say what happens if you try. We can't solve your problem if you don't let us know what your problem is. – tera Sep 09 '16 at 14:48
  • You need to provide full code, specify your GPU and CUDA version, and show us how the problem manifests (at the compilation or execution stage). – Bulat Sep 09 '16 at 14:56
  • Ok, I edited again. Thanks for the help. – Celil Özkurt Sep 09 '16 at 16:33

1 Answer


Two possibilities:

  1. Windows WDDM mode is not the best operating mode for a GPU when you are interested in various kinds of concurrency. This is covered in many other places such as this recent SO question.

  2. The tool you are using (Nsight VSE) may not be giving you the best information, or else you may be misinterpreting it (you haven't provided a screenshot of exactly what you are seeing).

Another thing you should be aware of is that your GT 920M (GK208) is a fairly low-end GPU, which may be limited in terms of kernel concurrency since it has only 2 SMs, but I don't think this is a limiting factor for this specific case.
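
If in doubt, the relevant capabilities can be read directly from the runtime. A minimal host-side sketch (separate from the code in the question) that prints the SM count, concurrent-kernel support, and whether the Windows TCC driver is in use (tccDriver is 0 under WDDM):

#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // concurrentKernels = 1 means the device can execute multiple kernels
    // concurrently; tccDriver = 1 means the TCC driver is in use (Windows only).
    printf("%s: SMs=%d concurrentKernels=%d tccDriver=%d\n",
           prop.name, prop.multiProcessorCount,
           prop.concurrentKernels, prop.tccDriver);
    return 0;
}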

I modified your posted code to uncomment the kernel2 launch (since that is really the one you are asking about) and ran it on CUDA 7.5, Fedora 20, and a GT 640 (which also uses GK208). Without making any other changes, this is the output I see in nvvp on Linux:

[screenshot: nvvp timeline]

What we see is:

  1. kernel1 shows up first, being launched twice in two different streams. This corresponds to your host-side launches of kernel1, and we see kernel concurrency here.

  2. After that, kernel2 is launched from the host. It is launched into the default stream, according to your code, and it shows up in nvvp that way as well. Furthermore, kernel2 in the nvvp compute timeline shows up as a solid bar for the first portion of its duration and a hollow bar for the last portion. This signifies that all threads of kernel2 have completed at the end of the solid bar, but kernel2's completion is held up by the implicit child-kernel synchronization associated with CDP (see the sketch after this list).

  3. In this "hollow bar area" of kernel2 on the timeline, we also see 2 new instances of kernel1. These 2 new instances are overlapped with each other, indicating proper concurrency, and at the completion of both of these child kernels, the parent kernel2 launch also finishes.
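
To make that implicit wait visible in code rather than only in the timeline, here is a minimal sketch (my variant, not the code from the question; child and parent are placeholder names) in which the parent explicitly synchronizes on its children with the device-side cudaDeviceSynchronize() and then destroys its streams. Omitting the synchronize call does not change correctness; the parent still cannot complete before its children do, which is exactly the hollow-bar behavior described above. (Device-side cudaDeviceSynchronize() is available in CUDA 7.5, though it has been deprecated in much newer CUDA versions.)

__global__ void child(const char *tag)
{
    printf("%s\n", tag);
}

__global__ void parent(void)
{
    cudaStream_t s1, s2;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);

    child<<<1, 1, 0, s1>>>("up stream");
    child<<<1, 1, 0, s2>>>("bottom stream");

    // Explicitly wait for the child grids launched by this thread block.
    cudaDeviceSynchronize();

    // Clean up the device-created streams (they are only valid within
    // this thread block anyway).
    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
}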

Basically, everything looks correct to me when I run it on Linux, using nvvp. If you want to explore concurrency in an unrestricted fashion, I definitely recommend Windows TCC mode or Linux over Windows WDDM mode; most GeForce GPUs cannot be placed into TCC mode, however.

You might ask why the child kernel launches don't have a separate stream timeline identified for them. I don't know the answer to that, but I assume it is some sort of tool limitation. There may also be a "presentation" issue: the parent kernel belongs to the default stream, so its child kernels do as well (even though you have to create separate streams for them for concurrency purposes). If this bothers you, you could consider filing an enhancement request (a bug report with the keyword RFE) at developer.nvidia.com.

Robert Crovella
  • I changed kernel1 and kernel2, and the output is: up stream bottom stream up stream bottom stream .... .... Now can we say this is a tool limitation? – Celil Özkurt Sep 09 '16 at 19:16
  • Also, my kernel2 does not look like your screenshot, but kernel1 does. – Celil Özkurt Sep 09 '16 at 19:37
  • [Device streams only exist within the scope of the enclosing thread block](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams-and-events), and stream designators may be reused thereafter. This limits how this information can be collected and presented in the profiler. – tera Sep 09 '16 at 19:57
  • nsight VSE has [considerable support](http://on-demand.gputechconf.com/gtc/2013/presentations/S3478-Debugging-CUDA-Kernel-Code.pdf) for handling CDP. My guess is that you are just not interpreting the output correctly. Have you expanded the compute timeline to show all the sub-timelines? – Robert Crovella Sep 09 '16 at 20:02
  • I added screenshot. – Celil Özkurt Sep 09 '16 at 20:15