I am beginning to learn CUDA programming. While learning about streams and the async/sync behavior, I have run into something I cannot explain. As the NVIDIA docs and many other sources say, cudaMemcpyAsync can be used to overlap data transfer and kernel execution in different streams, but the host memory involved must be page-locked (pinned). I was curious how this function behaves when the memory is not pinned.
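For reference, my understanding of the documented overlap pattern is roughly the following; this is my own minimal sketch with made-up names (busy_kernel, copy_stream, exec_stream), not the test program:

#include <cuda_runtime.h>

__global__ void busy_kernel(int* p){ p[threadIdx.x] += 1; }

void overlap_sketch(){
    cudaStream_t copy_stream, exec_stream;
    cudaStreamCreate(&copy_stream);
    cudaStreamCreate(&exec_stream);
    int *h_pinned, *d_in, *d_work;
    // Pinned (page-locked) host memory is what lets the copy engine run
    // while the SMs execute a kernel from another stream.
    cudaHostAlloc(&h_pinned, 256 * sizeof(int), cudaHostAllocDefault);
    cudaMalloc(&d_in, 256 * sizeof(int));
    cudaMalloc(&d_work, 256 * sizeof(int));
    cudaMemcpyAsync(d_in, h_pinned, 256 * sizeof(int), cudaMemcpyHostToDevice, copy_stream);
    busy_kernel<<<1, 256, 0, exec_stream>>>(d_work);   // may overlap the copy above
    cudaDeviceSynchronize();
}

To see what actually happens when the host memory is not pinned, I wrote the program below: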
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Print len ints from src.
void print_int(int* src, size_t len){
    for(size_t i = 0; i < len; ++i){
        printf(" %d ", src[i]);
    }
    printf("\n");
}

// Each of the 10 threads multiplies one element of its stream's
// 10-element chunk by (stream_i + 1).
__global__ void test_kernel(int* input_ptr, int* output_ptr, size_t stream_i){
    size_t idx = threadIdx.x;
    input_ptr += stream_i * 10;
    output_ptr += stream_i * 10;
    output_ptr[idx] = input_ptr[idx] * (stream_i + 1);
}
// Declared here, defined below; its body changes between experiments.
void test_stream(cudaStream_t* streams, int* dev_in, int* dev_out, int* host);

void test(){
    cudaStream_t st[3];
    for(size_t i = 0; i < 3; ++i){
        cudaStreamCreate(st + i);
    }
    size_t const data_len = 10;
    size_t const streams_num = 3;
    int* host_ptr = (int*)malloc(sizeof(int) * data_len * streams_num);
    for(size_t i = 0; i < data_len * streams_num; i++){
        host_ptr[i] = i + 1;
    }
    printf("initial array value:\n");
    print_int(host_ptr, data_len * streams_num);
    int *dev_input, *dev_output;
    cudaMalloc(&dev_input, sizeof(int) * data_len * streams_num);
    cudaMalloc(&dev_output, sizeof(int) * data_len * streams_num);
    cudaMemcpy(dev_input, host_ptr, sizeof(int) * data_len * streams_num, cudaMemcpyHostToDevice);
    test_stream(st, dev_input, dev_output, host_ptr);
    // This cudaMemcpy runs on the default stream, which synchronizes with
    // the other (blocking) streams, so all async work has finished before
    // the results are read back.
    cudaMemcpy(host_ptr, dev_output, sizeof(int) * data_len * streams_num, cudaMemcpyDeviceToHost);
    printf("result array value:\n");
    print_int(host_ptr, data_len * streams_num);
}
int main(){
    test();
    return 0;
}
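For brevity, none of the CUDA calls above check their return values; a small helper like the following (CHECK_CUDA is my own name for it, not a CUDA API) could wrap each call:

// Abort with a readable message if a CUDA runtime call fails.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage: CHECK_CUDA(cudaMalloc(&dev_input, sizeof(int) * 30));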
In the code above, the host array is initialized to {1, 2, ..., 30}, and the kernel multiplies each value in its stream's 10-element chunk by (stream_i + 1). The initial test_stream() looks like this:
void test_stream(cudaStream_t* streams, int* dev_in, int* dev_out, int* host){
    test_kernel<<<1, 10, 0, streams[0]>>>(dev_in, dev_out, 0);
    test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);
}
The result will be:
result array value:
1 2 3 4 5 6 7 8 9 10 22 24 26 28 30 32 34 36 38 40 0 0 0 0 0 0 0 0 0 0
Then I added a cudaMemcpyAsync() call between the two kernel launches:
void test_stream(cudaStream_t* streams, int* dev_in, int* dev_out, int* host){
    int* arr;
    // Pinned (page-locked) host buffer.
    cudaHostAlloc(&arr, sizeof(int) * 10, cudaHostAllocDefault);
    for(size_t i = 0; i < 10; i++){
        arr[i] = -1;
    }
    test_kernel<<<1, 10, 0, streams[0]>>>(dev_in, dev_out, 0);
    cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[1]);
    test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);
}
The result is:
result array value:
1 2 3 4 5 6 7 8 9 10 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 0 0 0 0 0 0 0 0 0 0
So the second kernel executes only after the data copy has completed, because they are issued on the same stream. Then I changed the cudaMemcpyAsync() call
into cudaMemcpyAsync(dev_in+10,arr,sizeof(int)*10,cudaMemcpyHostToDevice,streams[2]);
and the result went back to the initial one:
result array value:
1 2 3 4 5 6 7 8 9 10 22 24 26 28 30 32 34 36 38 40 0 0 0 0 0 0 0 0 0 0
I think this result shows the operations are asynchronous: the kernel execution overlapped with the data transfer, so the second kernel did not see the data written by cudaMemcpyAsync().
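To check the host-side blocking directly instead of inferring it from the output values, the cudaMemcpyAsync call itself could be timed; a call that merely enqueues the copy should return almost immediately, while one that degrades to a synchronous copy takes about as long as the transfer. A sketch of my own (inside test_stream, using std::chrono; for a 40-byte copy the difference may be lost in the noise, so a larger buffer would make it clearer):

#include <chrono>   // at the top of the file

auto t0 = std::chrono::steady_clock::now();
cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[2]);
auto t1 = std::chrono::steady_clock::now();
// A long duration here means the call blocked the host until the copy finished.
printf("cudaMemcpyAsync returned after %lld us\n",
       (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());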
Then I changed arr to a static int array:
static int arr[30];
//cudaHostAlloc(&arr,sizeof(int)*10,cudaHostAllocDefault);
for(size_t i = 0; i < 10; i++){
    arr[i] = -1;
}
test_kernel<<<1, 10, 0, streams[0]>>>(dev_in, dev_out, 0);
cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[2]);
test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);
The result is:
result array value:
1 2 3 4 5 6 7 8 9 10 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 0 0 0 0 0 0 0 0 0 0
That seems right: arr is not allocated through cudaHostAlloc, so the copy did not overlap with the streams[1] kernel.
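A related experiment I could run to confirm that pinning is the deciding factor: page-lock the existing static array in place with cudaHostRegister (a sketch, untested; cudaStreamSynchronize is needed before unregistering so the buffer is not released mid-copy):

static int arr[30];
// Pin the pageable array in place; if pinning is what matters, the copy
// below should now overlap the streams[1] kernel like the cudaHostAlloc case.
cudaHostRegister(arr, sizeof(arr), cudaHostRegisterDefault);
for(size_t i = 0; i < 10; i++){
    arr[i] = -1;
}
test_kernel<<<1, 10, 0, streams[0]>>>(dev_in, dev_out, 0);
cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[2]);
test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);
cudaStreamSynchronize(streams[2]);   // let the copy finish before unpinning
cudaHostUnregister(arr);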
But when I add another line (a kernel launch on streams[2]) to the code:
static int arr[30];
//cudaHostAlloc(&arr,sizeof(int)*10,cudaHostAllocDefault);
for(size_t i = 0; i < 10; i++){
    arr[i] = -1;
}
test_kernel<<<1, 10, 0, streams[0]>>>(dev_in, dev_out, 0);
test_kernel<<<1, 10, 0, streams[2]>>>(dev_in, dev_out, 2);
cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[2]);
test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);
The result becomes:
result array value:
1 2 3 4 5 6 7 8 9 10 22 24 26 28 30 32 34 36 38 40 63 66 69 72 75 78 81 84 87 90
Note that the kernel on streams[2] did its job and modified elements 21~30, but in addition it made the memory copy asynchronous again! Why does this happen? I can understand that an operation involving non-pinned memory may become synchronous. But why does launching a kernel on streams[2] before the memory copy make the copy behave as if it were dealing with pinned memory, overlapping with the kernel on streams[1]?
I have also tested an empty kernel on streams[2] and got the same result. It seems that any kernel launched on the same stream makes the cudaMemcpyAsync that follows it more "async". But why?
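For reference, the empty-kernel variant I tested looked roughly like this (empty_kernel is my own name for it):

__global__ void empty_kernel(){}

void test_stream(cudaStream_t* streams, int* dev_in, int* dev_out, int* host){
    static int arr[30];
    for(size_t i = 0; i < 10; i++){
        arr[i] = -1;
    }
    test_kernel<<<1, 10, 0, streams[0]>>>(dev_in, dev_out, 0);
    empty_kernel<<<1, 1, 0, streams[2]>>>();   // does no work, but occupies streams[2]
    cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[2]);
    test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);
}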
I also tried removing the static and making arr a plain local array, which produced the same result as the pinned memory allocated by cudaHostAlloc: another thing I cannot understand. What is the difference here? Does static make a variable less "pinned"?
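To see what the runtime itself thinks of each pointer, the allocations could be queried with cudaPointerGetAttributes; a sketch (describe_ptr is my own helper; on CUDA 10+ the attribute is attr.type, and pinned host memory reports cudaMemoryTypeHost):

void describe_ptr(const char* name, const void* p){
    cudaPointerAttributes attr;
    cudaError_t err = cudaPointerGetAttributes(&attr, p);
    if(err != cudaSuccess){
        // Older runtimes return an error for plain pageable pointers.
        printf("%s: %s\n", name, cudaGetErrorString(err));
        cudaGetLastError();   // clear the error
        return;
    }
    printf("%s pinned? %s\n", name, attr.type == cudaMemoryTypeHost ? "yes" : "no");
}

// Usage, e.g. inside test_stream (pinned_arr being the cudaHostAlloc buffer):
//   describe_ptr("static arr", arr);
//   describe_ptr("cudaHostAlloc arr", pinned_arr);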
To simplify the question, I classify the results as "sync_result" or "async_result" according to the values at positions 11~20 of the result array when test_stream calls cudaMemcpyAsync on streams[2]: "sync_result" is "-2 -2 -2 ..." and "async_result" is "22 24 ...". In summary, the results are:
- pinned memory from cudaHostAlloc: "async_result"
- static int: "sync_result"
- static int with a kernel issued on the same stream before the copy: "async_result"
- plain (non-static) int: "async_result"
Why do the results come out like this? In particular, why does issuing a kernel on the same stream before the cudaMemcpyAsync change whether the copy overlaps with the other kernel?
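For completeness: I know the copy-before-kernel ordering can be enforced explicitly across streams with an event, e.g. (a sketch):

cudaEvent_t copy_done;
cudaEventCreate(&copy_done);
cudaMemcpyAsync(dev_in + 10, arr, sizeof(int) * 10, cudaMemcpyHostToDevice, streams[2]);
cudaEventRecord(copy_done, streams[2]);
// Make streams[1] wait for the copy before running its kernel.
cudaStreamWaitEvent(streams[1], copy_done, 0);
test_kernel<<<1, 10, 0, streams[1]>>>(dev_in, dev_out, 1);

My question is not how to enforce the ordering, but why the default behavior changes as described above.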