I am trying to compute a histogram using some shared memory to improve performance. However I am running into a problem that I don't seem to figure out. Here is the kernel code i am having problem with. i am sure I am missing something silly but i can't locate it.
__global__
void histogram_kernel_shared(const unsigned int* const d_vals,
unsigned int* d_histo,
const unsigned int numElems) {
unsigned int gid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned int lid = threadIdx.x;
unsigned int bin = d_vals[gid];
__syncthreads();
__shared__ unsigned int local_bin[1024];
local_bin[lid] = d_histo[lid];
__syncthreads();
if(local_bin[lid] != d_histo[lid])
printf("After copy to local. block = %u, lid = %u, local_bin = %u, d_histo = %u \n", blockIdx.x, lid, local_bin[lid], d_histo[lid]);
__syncthreads();
// If I comment out this line everything works fine.
d_histo[lid] = local_bin[lid];
// Even this leads to some wrong answers. Printouts on the next printf.
// d_histo[lid] = d_histo[lid];
__syncthreads();
if(local_bin[lid] != d_histo[lid])
printf("copy back. block = %u, lid = %u, local_bin = %u, d_histo = %u \n", blockIdx.x, lid, local_bin[lid], d_histo[lid]);
__syncthreads();
atomicAdd(&d_histo[bin], static_cast<unsigned int>(1));
__syncthreads();
// atomicAdd(&local_bin[bin], static_cast<unsigned int>(1));
__syncthreads();
}
the kernel is launched as follows
threads = 1024;
blocks = numElems/threads;
histogram_kernel_shared<<<blocks, threads>>>(d_vals, d_histo, numElems);
number of elements is 10,240,000
and number of Bins is 1024.
What is bugging me is why should the assignment d_histo[lid] = local_bin[lid];
make difference here. Code runs fine without it. But nothing should change by that assignemtn since I just copied the value as local_bin[lid] = d_histo[lid];
and even more why does local_bin[lid] = d_histo[lid];
gives garbage values as well?
My guess is that something else is wrong somewhere else giving some odd kind of UB but Where?
Thanks for the help.