I am looking to analyse the total time spent on the kernels, running multiple time, and was wondering if this code would give me the total spend on the streamed kernels, or if time returned needed to be multiplied by the number of launches.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for(x=0; x<SIZE; x+=N*2){
gpuErrchk(cudaMemcpyAsync(data_d0, data_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(data_d1, data_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1));
gpuErrchk(cudaMemcpyAsync(array_d0, array_h, wrap->size*sizeof(node_r), cudaMemcpyHostToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(array_d1, array_h, wrap->size*sizeof(node_r), cudaMemcpyHostToDevice, stream1));
cudaEventRecord(start, 0);
GPU<<<N/512,512,0,stream0>>>(array_d0, data_d0, out_d0 );
GPU<<<N/512,512,0,stream1>>>(array_d1, data_d1, out_d1);
cudaEventRecord(stop, 0);
gpuErrchk(cudaMemcpyAsync(out_h+x, out_d0 , N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
gpuErrchk(cudaMemcpyAsync(out_h+x+N, out_d1 ,N * sizeof(int), cudaMemcpyDeviceToHost, stream1));
}
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Time %f ms\n", elapsedTime);