Here is a demo. The kernel cannot overlap with previous cudaMemcpyAsync, although they are in different streams.
#include <iostream>
#include <cuda_runtime.h>
__global__ void warmUp(){
int Id = blockIdx.x*blockDim.x+threadIdx.x;
if(Id == 0){
printf("warm up!");
}
}
__global__ void kernel(){
int Id = blockIdx.x*blockDim.x+threadIdx.x;
if(Id == 0){
long long x = 0;
for(int i=0; i<1000000; i++){
x += i>>1;
}
printf("kernel!%d\n", x);
}
}
int main(){
//warmUp<<<1,32>>>();
int *data, *data_dev;
int dataSize = pow(10, 7);
cudaMallocHost(&data, dataSize*sizeof(int));
cudaMalloc(&data_dev, dataSize*sizeof(int));
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaMemcpyAsync(data_dev, data, dataSize*sizeof(int), cudaMemcpyHostToDevice, stream1);
kernel<<<1, 32, 0, stream2>>>();
}
After some attempts, I found out that this is due to it being the first kernel call.
Uncomment warmUp<<<1,32>>>();
, Visual Profiler show, overlap!
Why?