I want to make use of the Page-locked Host memory
in CUDA to share message between the HOST and DEVICE ,Let me express my ideas through the following examples. I'm not sure whether it is reasonable.
The environment of my machine:
- Ubuntu 14.04.5 LTS
- gcc (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
- CUDA 9.1
I divided my program into four steps as below:
- Let's assume that there are two blocks, and for the first block, it do some compute and a signal is generated at the end of the first block;
- When the first block completes the function, it informs the CPU terminal, and then the corresponding data is organized in the CPU;
- Then, copy the data to the gpu and give the gpu a signal when the data copy is complete;
- The second block in gpu is triggered based on the signal in step 3.
After you have know about what I want to do, I have the problem that when I change the data in the page-locked memory(In my program is the signal), It seems they cannot be identified by the opposite device.
For this question, I have tried the following
- I find that The CUDA compiler may optimizes the value and stores the value in a register,so I cannot get the newest the value in the kernel,so I notice the PTX.
- I tried to make use of the PTX to prevent the compiler from optimizing part of the code and I get the signal in the kernel successfully, but failed in passing the signal form device to host, which confuse me much.
Part of code of my project is shown below:
__global__ void pipeline(int *flag_a, int*flag_b, int*Input, int*Out){
int idx = threadIdx.x;
if (blockIdx.x == 0) {
if (0 == idx) {
flag_a[0] = 1; //to generate signal in the step one
//why the host cannot get the flag_a[0]==1?
}
}
if (blockIdx.x == 1) {
if (0 == idx) {
int value = 0;
do{
asm volatile("ld.global.cg.u32 %0, [%1];" :"=r"(value) : "l"(&flag_b[0]));
//receipt signal form the host generate in step 3
//and the asm volatile to make sure I can get the newest flag_b[0]
} while (value != 1);
}
__syncthreads();
Out[idx] = Input[idx] + idx;
}
}
int main()
{
/*1*/
int *flag_a, *flag_b;
cudaHostAlloc((void**)&flag_a, sizeof(int), cudaHostAllocMapped);
cudaHostAlloc((void**)&flag_b, sizeof(int), cudaHostAllocMapped);
flag_a[0] = 0;
flag_b[0] = 0;
/*2*/
int*Input, *Out;
int *d_Input, *d_Out;
int*d_float_a, *d_float_b;
Input = (int*)malloc(sizeof(int) * 32);
Out = (int*)malloc(sizeof(int) * 32);
for (int i = 0; i<32; i++) {
Input[i] = i;
}
memset(Out, 0, sizeof(int) * 32);
cudaMalloc((void**)&d_Input, sizeof(int) * 32);
cudaMemset(d_Input, 0, sizeof(int) * 32);
cudaMalloc((void**)&d_Out, sizeof(int) * 32);
cudaMemset(d_Out, 0, sizeof(int) * 32);
cudaHostGetDevicePointer((void **)&d_float_a, (void *)flag_a, 0);
cudaHostGetDevicePointer((void **)&d_float_b, (void *)flag_b, 0);
cudaStream_t stream_kernel, stream_datacopy;
cudaStreamCreate(&stream_kernel);
cudaStreamCreate(&stream_datacopy);
pipeline <<< 2, 32, 0, stream_kernel >>> (d_float_a, d_float_b, d_Input, d_Out);
int count = 0;
do{
if (flag_a[0]==1){
cudaMemcpyAsync(d_Input, Input, sizeof(int) * 32, cudaMemcpyHostToDevice, stream_datacopy);
cudaStreamSynchronize(stream_datacopy);
flag_b[0] = 1; //step 3;
break;
}
if (count==10)
break;
} while (1 != flag_a[0]);
cudaStreamSynchronize(stream_kernel);
cudaMemcpy(Out, d_Out, sizeof(int) * 32, cudaMemcpyDeviceToHost);
for (int i = 0; i<32; i++) {
printf("%d:%d\n", i, Out[i]);
}
// free()
return 0;
}
I am not very good at CUDA programming and I am not sure if that's the right way to switch signal between the host and device, all I did was just an attempt and if anyone can give me advice, I will be appreciate it,Thanks in advance:)