0

I want to make use of the Page-locked Host memory in CUDA to share message between the HOST and DEVICE ,Let me express my ideas through the following examples. I'm not sure whether it is reasonable.

The environment of my machine:

 - Ubuntu 14.04.5 LTS
 - gcc (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
 - CUDA 9.1

I divided my program into four steps as below:

  1. Let's assume that there are two blocks, and for the first block, it do some compute and a signal is generated at the end of the first block;
  2. When the first block completes the function, it informs the CPU terminal, and then the corresponding data is organized in the CPU;
  3. Then, copy the data to the gpu and give the gpu a signal when the data copy is complete;
  4. The second block in gpu is triggered based on the signal in step 3.

After you have know about what I want to do, I have the problem that when I change the data in the page-locked memory(In my program is the signal), It seems they cannot be identified by the opposite device.

For this question, I have tried the following

  • I find that The CUDA compiler may optimizes the value and stores the value in a register,so I cannot get the newest the value in the kernel,so I notice the PTX.
  • I tried to make use of the PTX to prevent the compiler from optimizing part of the code and I get the signal in the kernel successfully, but failed in passing the signal form device to host, which confuse me much.

Part of code of my project is shown below:

__global__ void pipeline(int *flag_a, int*flag_b, int*Input, int*Out){
    int idx = threadIdx.x;
    if (blockIdx.x == 0) {
        if (0 == idx) {
            flag_a[0] = 1;    //to generate signal in the step one 
                              //why the host cannot get the flag_a[0]==1?
        }
    }

    if (blockIdx.x == 1) {
        if (0 == idx) {
            int value = 0;
            do{
                asm volatile("ld.global.cg.u32 %0, [%1];" :"=r"(value) : "l"(&flag_b[0]));
                //receipt signal form the host generate in step 3
                //and the asm volatile to make sure I can get the newest flag_b[0]
            } while (value != 1);
        }
        __syncthreads();
        Out[idx] = Input[idx] + idx;
    }
}

int main()
{
    /*1*/
    int *flag_a, *flag_b;
    cudaHostAlloc((void**)&flag_a, sizeof(int), cudaHostAllocMapped);
    cudaHostAlloc((void**)&flag_b, sizeof(int), cudaHostAllocMapped);
    flag_a[0] = 0;
    flag_b[0] = 0;
    /*2*/
    int*Input, *Out;
    int *d_Input, *d_Out;
    int*d_float_a, *d_float_b;
    Input = (int*)malloc(sizeof(int) * 32);
    Out = (int*)malloc(sizeof(int) * 32);
    for (int i = 0; i<32; i++) {
        Input[i] = i;
    }
    memset(Out, 0, sizeof(int) * 32);

    cudaMalloc((void**)&d_Input, sizeof(int) * 32);
    cudaMemset(d_Input, 0, sizeof(int) * 32);
    cudaMalloc((void**)&d_Out, sizeof(int) * 32);
    cudaMemset(d_Out, 0, sizeof(int) * 32);

    cudaHostGetDevicePointer((void **)&d_float_a, (void *)flag_a, 0);
    cudaHostGetDevicePointer((void **)&d_float_b, (void *)flag_b, 0);

    cudaStream_t stream_kernel, stream_datacopy;
    cudaStreamCreate(&stream_kernel);
    cudaStreamCreate(&stream_datacopy);

    pipeline <<< 2, 32, 0, stream_kernel >>> (d_float_a, d_float_b, d_Input, d_Out);
    int count = 0;
    do{
        if (flag_a[0]==1){
            cudaMemcpyAsync(d_Input, Input, sizeof(int) * 32, cudaMemcpyHostToDevice, stream_datacopy);
            cudaStreamSynchronize(stream_datacopy);
            flag_b[0] = 1;  //step 3;
            break;
        }
        if (count==10)
            break;
    } while (1 != flag_a[0]);

    cudaStreamSynchronize(stream_kernel);
    cudaMemcpy(Out, d_Out, sizeof(int) * 32, cudaMemcpyDeviceToHost);
    for (int i = 0; i<32; i++) {
        printf("%d:%d\n", i, Out[i]);
    }
    // free()
    return 0;
}

I am not very good at CUDA programming and I am not sure if that's the right way to switch signal between the host and device, all I did was just an attempt and if anyone can give me advice, I will be appreciate it,Thanks in advance:)

talonmies
  • 70,661
  • 34
  • 192
  • 269
GeekLee
  • 161
  • 1
  • 2
  • 11
  • 1
    it would be typical to mark the communication variables (pointers) as `volatile`. `asm volatile` does not have a similar behavior as `volatile` decoration on a variable. Furthermore, there may be points where you want to help the process along with an appropriate barrier such as `threadfence_system()`. Finally, Windows WDDM is not your friend here. This [question](https://stackoverflow.com/questions/20345702/how-can-i-check-the-progress-of-matrix-multiplication/20381924#20381924) has some possibly useful info. It shouldn't be necessary to use PTX to make this work. – Robert Crovella Jun 26 '18 at 19:03
  • @RobertCrovella Thanks Robert, I'll refer to your advice and try to reconsider how to make the communication. Also, The transfer of signals between CPU and GPU seems not as convenient as I imagined. I originally thought that the data in Page-Locked Host Memory could be directly shared between CPU and GPU. So,what should I do to ensure the stability of communications? Do you have relevant experience and suggestions for me? Or, Is it convenient for you to give a brief solution to the problem I gave above, which may be very meaningful to me? Thank again! – GeekLee Jun 27 '18 at 03:30
  • @RobertCrovella Another problem is that when I want to release a pointer decorated with volatile(Like this `cudaFreeHost((void *)hptr)` ), it will give an error `unspecified launch failure` , I have no idea about this error after I searched for related information,any opinion on this error? I feel helpless. – GeekLee Jun 28 '18 at 12:24

1 Answers1

1

In the end, I deleted the code of the PTX part and put the code in of Tesla P100-PCIE (TCC mode), which can run the program I expected correctly. Thank RobertCrovella's hint gave in the comment.

Here is the updated code and results.

__global__ void pipeline(volatile float *flag_a, volatile float*flag_b, int*Input, int*Out)
{
    int idx = threadIdx.x;
    if (blockIdx.x == 0) {
        if (0 == idx) {
            flag_a[idx] = 1;    
        }
    }

    if (blockIdx.x == 1) {
        if (0 == idx) {
            while (!(1 == flag_b[0])) {
            }
        }
        __syncthreads();
        Out[idx] = Input[idx] + idx;
    }
}

In the main funtion can get the signal from kernel.

int main()
{
    //Data definition
    pipeline << < 2, 32, 0, stream_kernel >> > (flag_a, flag_b, d_Input, d_Out);
    while (flag_a[0] == 0);
    if (flag_a[0] == 1)
    {
        std::cout << "get the flag_a[0]==1" << std::endl;
        cudaMemcpyAsync(d_Input, Input, sizeof(int) * 32, cudaMemcpyHostToDevice, stream_datacopy);
        cudaStreamSynchronize(stream_datacopy);
        flag_b[0] = 1;
        std::cout << "data transfer has finished" << std::endl;
    }

    cudaStreamSynchronize(stream_kernel);
    cudaMemcpy(Out, d_Out, sizeof(int) * 32, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) 
    {
        printf("%d:%d\n", i, Out[i]);
    }
    //free the memory;
    return 0;
}

Here is the result.

GeekLee
  • 161
  • 1
  • 2
  • 11