I'm trying to understand how to use __threadfence()
, as it seems like a powerful synchronization primitive that lets different blocks work together without going through the huge hassle of ending a kernel and starting a new one. The CUDA C Programming guide has an example of it (Appendix B.5), which is fleshed out in the "threadFenceReduction" sample in the SDK, so it seems like something we "should" be using.
However, when I have tried using __threadfence()
, it is shockingly slow. See the code below for an example. From what I understand, __threadfence()
should just make sure that all pending memory transfers from the current thread block are finished, before proceeding. Memory latency is somewhat better than a microsecond, I believe, so the total time to deal with the 64KB of memory transfers in the included code, on a GTX680, should be somewhere around a microsecond. Instead, the __threadfence()
instruction seems to take around 20
microseconds! Instead of using __threadfence()
to synchronize, I can instead end the kernel, and launch an entirely new kernel (in the same, default, stream so that it is synchronized), in less then a third of the time!
What is going on here? Does my code have a bug in it that I'm not noticing? Or is __threadfence()
really 20x
slower than it should be, and 6x
slower than an entire kernel launch+cleanup?
Time for 1000 runs of the threadfence kernel: 27.716831 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 25.962912 ms
Synchronizing without threadfence, by splitting to two kernels: 7.653344 ms
Answer: 120
#include "cuda.h"
#include <cstdio>
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;
__device__ int scratch[16];
__device__ int junk[16000];
__device__ int answer;
__global__ void usethreadfence() //just like the code example in B.5 of the CUDA C Programming Guide
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x; //do some more memory writes to make the kernel nontrivial
__threadfence();
if (threadIdx.x==0) {
unsigned int value = atomicInc(&count, gridDim.x);
isLastBlockDone = (value == (gridDim.x - 1));
}
__syncthreads();
if (isLastBlockDone && threadIdx.x==0) {
// The last block sums the results stored in scratch[0 .. gridDim.x-1]
int sum=0;
for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
answer=sum;
}
}
__global__ void justthreadfence() //first three lines of the previous kernel, so we can compare speeds
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
__threadfence();
}
__global__ void usetwokernels_1() //this and the next kernel reproduce the functionality of the first kernel, but faster!
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
}
__global__ void usetwokernels_2()
{
if (threadIdx.x==0) {
int sum=0;
for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
answer=sum;
}
}
int main() {
int sum;
cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
for (int i=0;i<1000;i++) usethreadfence<<<16,1000>>>();
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Time for 1000 runs of the threadfence kernel: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
cudaMemcpyFromSymbol(&sum,answer,sizeof(int)); printf("Answer: %d\n",sum);
cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
for (int i=0;i<1000;i++) justthreadfence<<<16,1000>>>();
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Time for 1000 runs of just the first 3 lines, including threadfence: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
for (int i=0;i<1000;i++) {usetwokernels_1<<<16,1000>>>(); usetwokernels_2<<<16,1000>>>();}
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Synchronizing without threadfence, by splitting to two kernels: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
cudaMemcpyFromSymbol(&sum,answer,sizeof(int)); printf("Answer: %d\n",sum);
}