I just found out about the libcu++ library and am trying to use the cuda::atomic
variables. I wrote the following program but it is giving me unexpected results:
#include <atomic>
#include <cuda/atomic>
#include <stdio.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void atomic_test()
{
cuda::atomic<int, cuda::thread_scope_block> x{0};
x.fetch_add(1, cuda::memory_order_seq_cst);
__syncthreads();
int y = x.load(cuda::memory_order_acquire);
printf("(%d %d) - Value of x is %d\n", blockIdx.x, threadIdx.x, y);
}
int main()
{
atomic_test<<<2, 32>>>();
gpuErrchk( cudaDeviceSynchronize() );
return 0;
}
Since there is a __syncthreads()
after the x.fetch_add()
, I expect that every thread in the block reads the same value from x
. But when I run this program, every thread except thread 31 prints 0
and thread 31 prints 32
. Can someone please explain what I am doing wrong and why I am seeing this output?
I tried going through the libcu++ website. But I couldn't get any complete examples except trie.cu and concurrent_hash_table.cu. These examples unfortunately seems a bit too complicated for me.
I also found a stack-overflow solution using semaphores. But it works as I expect it to.