The following global barrier works on Kepler K10 and not Fermi GTX580:
__global__ void cudaKernel (float* ref1, float* ref2, int* lock, int time, int dim) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int lid = threadIdx.x;
int numT = blockDim.x * gridDim.x;
int numP = int (dim / numT);
int numB = gridDim.x;
for (int t = 0; t < time; ++t) {
// compute @ time t
for (int i = 0; i < numP; ++i) {
int idx = gid + i * numT;
if (idx > 0 && idx < dim - 1)
ref2 [idx] = 0.333f * ((ref1 [idx - 1] + ref1 [idx]) + ref1 [idx + 1]);
}
// global sync
if (lid == 0){
atomicSub (lock, 1);
while (atomicCAS(lock, 0, 0) != 0);
}
__syncthreads();
// copy-back @ time t
for (int i = 0; i < numP; ++i) {
int idx = gid + i * numT;
if (idx > 0 && idx < dim - 1)
ref1 [idx] = ref2 [idx];
}
// global sync
if (lid == 0){
atomicAdd (lock, 1);
while (atomicCAS(lock, numB, numB) != numB);
}
__syncthreads();
}
}
So, by looking at the output sent back to CPU, I noticed that one thread (either 1st or last thread) escapes the barrier and resumes execution earlier than the others. I'm using CUDA 5.0. number of blocks is also always smaller than number of SMs (in my set of runs).
Any idea why the same code wouldn't work on two architectures? What's new in Kepler that helps this global synchronization?