Please refer to the two snapshots below, showing an NVIDIA Visual Profiler session of my CUDA code:
Snapshot from the nvprof session, showing the thrust::sort and thrust::reduce call execution timeline
The sort and reduce calls are highlighted to show the times taken and the gaps between their executions
You can see a gap of approximately 70 µs between the two thrust::sort() calls, and then a much larger gap between the second thrust::sort() and the first thrust::reduce() call. In all, approximately 300 µs of such gaps are visible in the snapshot. I believe these are 'idle' times, perhaps introduced by the Thrust library, but I couldn't find any relevant discussion of, or NVIDIA documentation on, this behavior. Can someone please explain why I have such apparent 'idle' times? Combined, they account for roughly 40% of my application's execution time, so this is a big concern for me!
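For what it's worth, the gap total can also be bounded from the host side. The sketch below is illustrative only (it reuses d_arr1, d_arr2, and ARR_SIZE from the sample code further down): the elapsed time across the whole region, minus the kernel durations nvprof reports, should roughly equal the idle time in between.

// Sketch: bound the total idle time of the region from the host side.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
thrust::sort(thrust::device, d_arr1, d_arr1 + ARR_SIZE);
thrust::sort(thrust::device, d_arr2, d_arr2 + ARR_SIZE);
// ... the four thrust::reduce calls from the sample go here ...
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
printf("sort/reduce region took %.3f ms end to end\n", elapsed_ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);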
Also, I have measured that the gap between successive launches of the CUDA kernels I wrote myself is only about 3 µs!
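To make that comparison concrete, the kind of back-to-back launch I mean looks like the snippet below (dummyKernel is only an illustrative stand-in, not one of my actual kernels):

// Illustrative stand-in kernel (file scope); two such launches back to
// back show a launch-to-launch gap of only ~3 us in the nvprof timeline.
__global__ void dummyKernel(float* p, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] += 1.0f;
}

// ... inside the host function, same stream, launched back to back:
dummyKernel<<<(ARR_SIZE + 255) / 256, 256>>>(d_arr1, ARR_SIZE);
dummyKernel<<<(ARR_SIZE + 255) / 256, 256>>>(d_arr1, ARR_SIZE);
cudaDeviceSynchronize();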
I have written a small sample of CUDA code to post here:
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cstdlib>
#include <cstdio>

void profileThrustSortAndReduce(const int ARR_SIZE) {
    // thrust::reduce is also run on the first 10% of each sorted array
    const int ARR_SIZE_BY_10 = ARR_SIZE / 10;

    // generate host random arrays of float in pinned memory
    float* h_arr1; cudaMallocHost((void**)&h_arr1, ARR_SIZE * sizeof(float));
    float* h_arr2; cudaMallocHost((void**)&h_arr2, ARR_SIZE * sizeof(float));
    for (int i = 0; i < ARR_SIZE; i++) {
        h_arr1[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * 1000.0f;
        h_arr2[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * 1000.0f;
    }

    // allocate and populate the device arrays
    float* d_arr1; cudaMalloc((void**)&d_arr1, ARR_SIZE * sizeof(float));
    float* d_arr2; cudaMalloc((void**)&d_arr2, ARR_SIZE * sizeof(float));
    cudaMemcpy(d_arr1, h_arr1, ARR_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_arr2, h_arr2, ARR_SIZE * sizeof(float), cudaMemcpyHostToDevice);

    // start the CUDA profiler
    cudaProfilerStart();

    // sort the two device arrays
    thrust::sort(thrust::device, d_arr1, d_arr1 + ARR_SIZE);
    thrust::sort(thrust::device, d_arr2, d_arr2 + ARR_SIZE);

    // mean over all 100 percentiles of the first device array
    float arr1_red_100pc_mean = thrust::reduce(thrust::device, d_arr1, d_arr1 + ARR_SIZE) / ARR_SIZE;
    // mean over the smallest 10 percentiles of the first device array
    float arr1_red_10pc_mean = thrust::reduce(thrust::device, d_arr1, d_arr1 + ARR_SIZE_BY_10) / ARR_SIZE_BY_10;
    // mean over all 100 percentiles of the second device array
    float arr2_red_100pc_mean = thrust::reduce(thrust::device, d_arr2, d_arr2 + ARR_SIZE) / ARR_SIZE;
    // mean over the smallest 10 percentiles of the second device array
    float arr2_red_10pc_mean = thrust::reduce(thrust::device, d_arr2, d_arr2 + ARR_SIZE_BY_10) / ARR_SIZE_BY_10;

    // stop the CUDA profiler
    cudaProfilerStop();

    // print the results so the computations are observable
    printf("%f %f %f %f\n", arr1_red_100pc_mean, arr1_red_10pc_mean,
           arr2_red_100pc_mean, arr2_red_10pc_mean);

    // free device and pinned host memory
    cudaFree(d_arr1); cudaFree(d_arr2);
    cudaFreeHost(h_arr1); cudaFreeHost(h_arr2);
}
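For completeness, this is roughly how I drive the function; the array size of 1 << 20 is an arbitrary choice for illustration. I build with nvcc and record the timeline with nvprof / the Visual Profiler.

// Illustrative driver for the sample function above.
int main() {
    profileThrustSortAndReduce(1 << 20);
    cudaDeviceReset();  // also flushes profiler data before exit
    return 0;
}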
Snapshot of the nvprof session of this sample function