
I'm testing the effect of inserting atomic addition operations into optimized array reduction kernels to measure the performance impact, and I'm failing to understand the results. I've tested five different kernels:

0 - fully optimized reduction kernel as provided in samples/6_Advanced/reduction/reduction_kernel.cu  
1 - optimized reduction kernel as described in samples/6_Advanced/docs/reduction.pdf  
2 - kernel 1 with atomic warp-synchronous reduction  
3 - kernel 2 with completely atomic reduction within all shared memory  
4 - kernel 3 with completely atomic reduction

The average reduction times on the device I'm using, for a sufficiently large number of elements:

0 - 0.00103s  
1 - 0.00103s  
2 - 0.00103s  
3 - 0.00103s  
4 - 0.00117s  

Why do atomic operations appear to have no impact whatsoever on kernels 2 or 3 and some small impact on kernel 4?

Here is the full code. The relevant kernels are:

  /////////////////
 // warp reduce //
/////////////////
/* warp-synchronous reduction using volatile memory
 * to prevent instruction reordering for non-atomic
 * operations */

template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
  if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
  if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
  if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
  if (blockSize >=  8) sdata[tid] += sdata[tid + 4];
  if (blockSize >=  4) sdata[tid] += sdata[tid + 2];
  if (blockSize >=  2) sdata[tid] += sdata[tid + 1];
}

  ////////////////////////
 // atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
 * to serialize computation */

template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
  if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
  if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
  if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
  if (blockSize >=  8) atomicAdd(&sdata[tid], sdata[tid + 4]);
  if (blockSize >=  4) atomicAdd(&sdata[tid], sdata[tid + 2]);
  if (blockSize >=  2) atomicAdd(&sdata[tid], sdata[tid + 1]);
}

  ////////////////////////
 // reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
 * cuda/samples/6_Advanced/reduction/reduction_kernel.cu */

template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  int sum = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sum += g_idata[i];
    // check bounds
    if (nIsPow2 || i + blockSize < n)
      sum += g_idata[i + blockSize];
    i += gridSize;
  }
  // local sum -> shared memory
  sdata[tid] = sum;
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] = sum = sum + sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] = sum = sum + sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] = sum = sum + sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) {
    // warp-synchronous reduction
    // volatile memory stores won't be reordered by compiler
    volatile int *smem = sdata;
    if (blockSize >= 64)
      smem[tid] = sum = sum + smem[tid + 32];
    if (blockSize >= 32)
      smem[tid] = sum = sum + smem[tid + 16];
    if (blockSize >= 16)
      smem[tid] = sum = sum + smem[tid + 8];
    if (blockSize >= 8)
      smem[tid] = sum = sum + smem[tid + 4];
    if (blockSize >= 4)
      smem[tid] = sum = sum + smem[tid + 2];
    if (blockSize >= 2)
      smem[tid] = sum = sum + smem[tid + 1];
  }
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 1  //
/////////////////////////
/* fastest reduction algorithm described in
 * cuda/samples/6_Advanced/reduction/doc/reduction.pdf */

template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] += sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] += sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] += sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) warpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 2  //
/////////////////////////
/* reduction kernel 1 executed
 * with atomic warp-synchronous addition */

template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      sdata[tid] += sdata[tid + 256];
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      sdata[tid] += sdata[tid + 128];
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      sdata[tid] += sdata[tid + 64];
    __syncthreads();
  }
  if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 3  //
/////////////////////////
/* reduction kernel 2 executed
 * with atomic reduction within all shared memory */

template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    sdata[tid] += g_idata[i] + g_idata[i+blockSize];
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      atomicAdd(&sdata[tid], sdata[tid + 256]);
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      atomicAdd(&sdata[tid], sdata[tid + 128]);
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      atomicAdd(&sdata[tid], sdata[tid + 64]);
    __syncthreads();
  }
  if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

  /////////////////////////
 // reduction kernel 4  //
/////////////////////////
/* reduction kernel 3 executed
 * with completely atomic reduction */

template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
  extern __shared__ int sdata[];
  // first level of reduction (global -> shared)
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
  unsigned int gridSize = blockSize * 2 * gridDim.x;
  sdata[tid] = 0;
  // reduce multiple elements per thread
  while (i < n) {
    atomicAdd(&sdata[tid], (g_idata[i] + g_idata[i+blockSize]));
    i += gridSize;
  }
  __syncthreads();
  // reduce in shared memory
  if (blockSize >= 512) {
    if (tid < 256)
      atomicAdd(&sdata[tid], sdata[tid + 256]);
    __syncthreads();
  }
  if (blockSize >= 256) {
    if (tid < 128)
      atomicAdd(&sdata[tid], sdata[tid + 128]);
    __syncthreads();
  }
  if (blockSize >= 128) {
    if (tid < 64)
      atomicAdd(&sdata[tid], sdata[tid + 64]);
    __syncthreads();
  }
  if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
  // write result for block to global memory
  if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}
  • The obvious first question is are you certain that the time measurements are correct? And the second is what device are you running these tests on? – talonmies Jun 19 '13 at 18:31
  • I believe that the time measurements are correct. Most of the time measurement code is from the sample reduction.cpp and it behaves logically when I throw it a very inefficient algorithm or a huge array. The device is a Quadro 4000. – user1743798 Jun 19 '13 at 18:39
  • There's not much in there. – user1743798 Jun 19 '13 at 20:18
  • The header file only contains a forward declaration for the kernel wrapper function. – user1743798 Jun 19 '13 at 21:52
  • Use the NVIDIA Compute Profiler and edit your post with the results. – Fr34K Jun 24 '13 at 11:18
  • Even though it's been 3 years, user1743798 - please accept the answer or explain why it's incorrect. – einpoklum Oct 16 '16 at 21:21

1 Answer


In your code, you are not using proper CUDA error checking on the kernel invocations. Since the timings are all the same, I strongly suspect that your kernels are not really launched. I have verified on my own CUDA reduction setup that the same timings are obtained when the number of reduction elements is 1<<24; in that case, proper error checking returns an invalid configuration argument.
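
For reference, a minimal error-checking sketch looks like the following (the gpuErrchk macro name and the reduce2 launch shown in the comments are illustrations only, not taken from your code):

#include <cstdio>
#include <cuda_runtime.h>

// Minimal launch error checking (illustrative sketch; the macro name is hypothetical).
#define gpuErrchk(call)                                                  \
    do {                                                                 \
        cudaError_t err = (call);                                        \
        if (err != cudaSuccess)                                          \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                 \
                    cudaGetErrorString(err), __FILE__, __LINE__);        \
    } while (0)

// After each kernel launch, check both the launch itself and its completion:
//   reduce2<512><<<blocks, threads, smemBytes>>>(d_idata, d_odata, n);
//   gpuErrchk(cudaGetLastError());      // reports e.g. "invalid configuration argument"
//   gpuErrchk(cudaDeviceSynchronize()); // reports errors raised during kernel execution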

I seize the opportunity to mention that your atomicWarpReduce __device__ function is actually incorrect, since it lacks proper synchronization (see also the thread Removing __syncthreads() in CUDA warp-level reduction). The correct version is

template <class T>
__device__ void atomicWarpReduce(T *sdata, int tid) {
    atomicAdd(&sdata[tid], sdata[tid + 32]); __syncthreads();
    atomicAdd(&sdata[tid], sdata[tid + 16]); __syncthreads();
    atomicAdd(&sdata[tid], sdata[tid + 8]); __syncthreads();
    atomicAdd(&sdata[tid], sdata[tid + 4]); __syncthreads();
    atomicAdd(&sdata[tid], sdata[tid + 2]); __syncthreads();
    atomicAdd(&sdata[tid], sdata[tid + 1]); __syncthreads();
}

Of course you don't need atomics in this case, and I understand that they are here simply for the sake of understanding. But atomics do not enforce synchronization; they only avoid race conditions (which are not present here anyway) by making the accesses to the shared memory array sdata sequential. You may wish to compare the disassembled code for

YOUR VERSION

    Function : _Z18reduce4_atomicWarpIiEvPT_S1_j
.headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_CTAID.X;                             /* 0x2c00000094001c04 */
    /*0010*/         SHL R3, R0, 0x1;                                /* 0x6000c0000400dc03 */
    /*0018*/         S2R R2, SR_TID.X;                               /* 0x2c00000084009c04 */
    /*0020*/         IMAD R3, R3, c[0x0][0x8], R2;                   /* 0x200440002030dca3 */
    /*0028*/         IADD R4, R3, c[0x0][0x8];                       /* 0x4800400020311c03 */
    /*0030*/         ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT;  /* 0x188e4000a031dc03 */
    /*0038*/         ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT;  /* 0x1b0e4000a043dc03 */
    /*0040*/     @P0 ISCADD R3, R3, c[0x0][0x20], 0x2;               /* 0x400040008030c043 */
    /*0048*/    @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2;               /* 0x4000400080412443 */
    /*0050*/    @!P0 MOV R5, RZ;                                     /* 0x28000000fc0161e4 */
    /*0058*/    @!P1 LD R4, [R4];                                    /* 0x8000000000412485 */
    /*0060*/     @P0 LD R5, [R3];                                    /* 0x8000000000314085 */
    /*0068*/         SHL R3, R2, 0x2;                                /* 0x6000c0000820dc03 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/    @!P1 IADD R5, R4, R5;                                /* 0x4800000014416403 */
    /*0080*/         MOV R4, c[0x0][0x8];                            /* 0x2800400020011de4 */
    /*0088*/         STS [R3], R5;                                   /* 0xc900000000315c85 */
    /*0090*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    /* 0x50ee0000ffffdc04 */
    /*0098*/         MOV R6, c[0x0][0x8];                            /* 0x2800400020019de4 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R6, 0x42, PT;          /* 0x188ec0010861dc03 */
    /*00a8*/     @P0 BRA 0x118;                                      /* 0x40000001a00001e7 */
    /*00b0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00b8*/         NOP;                                            /* 0x4000000000001de4 */
    /*00c0*/         MOV R6, R4;                                     /* 0x2800000010019de4 */
    /*00c8*/         SHR.U32 R4, R4, 0x1;                            /* 0x5800c00004411c03 */
    /*00d0*/         ISETP.GE.U32.AND P0, PT, R2, R4, PT;            /* 0x1b0e00001021dc03 */
    /*00d8*/    @!P0 IADD R7, R4, R2;                                /* 0x480000000841e003 */
    /*00e0*/    @!P0 SHL R7, R7, 0x2;                                /* 0x6000c0000871e003 */
    /*00e8*/    @!P0 LDS R7, [R7];                                   /* 0xc10000000071e085 */
    /*00f0*/    @!P0 IADD R5, R7, R5;                                /* 0x4800000014716003 */
    /*00f8*/    @!P0 STS [R3], R5;                                   /* 0xc900000000316085 */
    /*0100*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    /* 0x50ee0000ffffdc04 */
    /*0108*/         ISETP.GT.U32.AND P0, PT, R6, 0x83, PT;          /* 0x1a0ec0020c61dc03 */
    /*0110*/     @P0 BRA 0xc0;                                       /* 0x4003fffea00001e7 */
    /*0118*/         ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;          /* 0x1a0ec0007c21dc03 */
    /*0120*/         SSY 0x2a8;                                      /* 0x6000000600000007 */
    /*0128*/     @P0 BRA 0x2a0;                                      /* 0x40000005c00001e7 */
    /*0130*/         LDS R4, [R3+0x80];                              R4 = sdata[tid + 32]
    /*0138*/         SSY 0x168;                                      
    /*0140*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0148*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0150*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0158*/    @!P0 BRA 0x140;                                      /* 0x4003ffff800021e7 */
    /*0160*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*0168*/         LDS R4, [R3+0x40];                              R4 = sdata[tid + 16]
    /*0170*/         SSY 0x1a8;                                      

    /*0178*/         NOP;                                            /* 0x4000000000001de4 */

    /*0180*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0188*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0190*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0198*/    @!P0 BRA 0x180;                                      /* 0x4003ffff800021e7 */
    /*01a0*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*01a8*/         LDS R4, [R3+0x20];                              R4 = sdata[tid + 8]
    /*01b0*/         SSY 0x1e8;                                      

    /*01b8*/         NOP;                                            /* 0x4000000000001de4 */

    /*01c0*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*01c8*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*01d0*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*01d8*/    @!P0 BRA 0x1c0;                                      /* 0x4003ffff800021e7 */
    /*01e0*/         NOP.S;                                          /* 0x4000000000001df4 */

    /*01e8*/         LDS R6, [R3+0x10];                              /* 0xc100000040319c85 */
    /*01f0*/         LDS R5, [R3+0x8];                               /* 0xc100000020315c85 */
    /*01f8*/         LDS R4, [R3+0x4];                               /* 0xc100000010311c85 */
    /*0200*/         SSY 0x230;                                      /* 0x60000000a0000007 */
    /*0208*/         LDSLK P0, R7, [R3];                             /* 0xc40000000031dc85 */
    /*0210*/     @P0 IADD R7, R7, R6;                                /* 0x480000001871c003 */
    /*0218*/     @P0 STSUL [R3], R7;                                 /* 0xcc0000000031c085 */
    /*0220*/    @!P0 BRA 0x208;                                      /* 0x4003ffff800021e7 */
    /*0228*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*0230*/         SSY 0x268;                                      /* 0x60000000c0000007 */
    /*0238*/         NOP;                                            /* 0x4000000000001de4 */
    /*0240*/         LDSLK P0, R6, [R3];                             /* 0xc400000000319c85 */
    /*0248*/     @P0 IADD R6, R6, R5;                                /* 0x4800000014618003 */
    /*0250*/     @P0 STSUL [R3], R6;                                 /* 0xcc00000000318085 */
    /*0258*/    @!P0 BRA 0x240;                                      /* 0x4003ffff800021e7 */
    /*0260*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*0268*/         NOP;                                            /* 0x4000000000001de4 */
    /*0270*/         NOP;                                            /* 0x4000000000001de4 */
    /*0278*/         NOP;                                            /* 0x4000000000001de4 */
    /*0280*/         LDSLK P0, R5, [R3];                             /* 0xc400000000315c85 */
    /*0288*/     @P0 IADD R5, R5, R4;                                /* 0x4800000010514003 */
    /*0290*/     @P0 STSUL [R3], R5;                                 /* 0xcc00000000314085 */
    /*0298*/    @!P0 BRA 0x280;                                      /* 0x4003ffff800021e7 */
    /*02a0*/         ISETP.NE.AND.S P0, PT, R2, RZ, PT;              /* 0x1a8e0000fc21dc33 */
    /*02a8*/     @P0 BRA.U 0x2c8;                                    /* 0x40000000600081e7 */
    /*02b0*/    @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;               /* 0x4000400090002043 */
    /*02b8*/    @!P0 LDS R2, [RZ];                                   /* 0xc100000003f0a085 */
    /*02c0*/    @!P0 ST [R0], R2;                                    /* 0x900000000000a085 */
    /*02c8*/         EXIT;                                           /* 0x8000000000001de7 */

and

THE CORRECT VERSION

    Function : _Z18reduce4_atomicWarpIiEvPT_S1_j
.headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_CTAID.X;                             /* 0x2c00000094001c04 */
    /*0010*/         SHL R3, R0, 0x1;                                /* 0x6000c0000400dc03 */
    /*0018*/         S2R R2, SR_TID.X;                               /* 0x2c00000084009c04 */
    /*0020*/         IMAD R3, R3, c[0x0][0x8], R2;                   /* 0x200440002030dca3 */
    /*0028*/         IADD R4, R3, c[0x0][0x8];                       /* 0x4800400020311c03 */
    /*0030*/         ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT;  /* 0x188e4000a031dc03 */
    /*0038*/         ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT;  /* 0x1b0e4000a043dc03 */
    /*0040*/     @P0 ISCADD R3, R3, c[0x0][0x20], 0x2;               /* 0x400040008030c043 */
    /*0048*/    @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2;               /* 0x4000400080412443 */
    /*0050*/    @!P0 MOV R5, RZ;                                     /* 0x28000000fc0161e4 */
    /*0058*/    @!P1 LD R4, [R4];                                    /* 0x8000000000412485 */
    /*0060*/     @P0 LD R5, [R3];                                    /* 0x8000000000314085 */
    /*0068*/         SHL R3, R2, 0x2;                                /* 0x6000c0000820dc03 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/    @!P1 IADD R5, R4, R5;                                /* 0x4800000014416403 */
    /*0080*/         MOV R4, c[0x0][0x8];                            /* 0x2800400020011de4 */
    /*0088*/         STS [R3], R5;                                   /* 0xc900000000315c85 */
    /*0090*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    /* 0x50ee0000ffffdc04 */
    /*0098*/         MOV R6, c[0x0][0x8];                            /* 0x2800400020019de4 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R6, 0x42, PT;          /* 0x188ec0010861dc03 */
    /*00a8*/     @P0 BRA 0x118;                                      /* 0x40000001a00001e7 */
    /*00b0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00b8*/         NOP;                                            /* 0x4000000000001de4 */
    /*00c0*/         MOV R6, R4;                                     /* 0x2800000010019de4 */
    /*00c8*/         SHR.U32 R4, R4, 0x1;                            /* 0x5800c00004411c03 */
    /*00d0*/         ISETP.GE.U32.AND P0, PT, R2, R4, PT;            /* 0x1b0e00001021dc03 */
    /*00d8*/    @!P0 IADD R7, R4, R2;                                /* 0x480000000841e003 */
    /*00e0*/    @!P0 SHL R7, R7, 0x2;                                /* 0x6000c0000871e003 */
    /*00e8*/    @!P0 LDS R7, [R7];                                   /* 0xc10000000071e085 */
    /*00f0*/    @!P0 IADD R5, R7, R5;                                /* 0x4800000014716003 */
    /*00f8*/    @!P0 STS [R3], R5;                                   /* 0xc900000000316085 */
    /*0100*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    __syncthreads()
    /*0108*/         ISETP.GT.U32.AND P0, PT, R6, 0x83, PT;          /* 0x1a0ec0020c61dc03 */
    /*0110*/     @P0 BRA 0xc0;                                       /* 0x4003fffea00001e7 */
    /*0118*/         ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;          
    /*0120*/         SSY 0x2b8;                                      
    /*0128*/     @P0 BRA 0x2b0;                                      /* 0x40000006000001e7 */
    /*0130*/         LDS R4, [R3+0x80];                              R4 = sdata[tid + 32]
    /*0138*/         SSY 0x168;                                      
    /*0140*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0148*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0150*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0158*/    @!P0 BRA 0x140;                                      /* 0x4003ffff800021e7 */
    /*0160*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*0168*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    __syncthreads()
    /*0170*/         LDS R4, [R3+0x40];                              R4 = sdata[tid + 16]
    /*0178*/         SSY 0x1a8;                                      

    /*0180*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0188*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0190*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0198*/    @!P0 BRA 0x180;                                      /* 0x4003ffff800021e7 */
    /*01a0*/         NOP.S;                                          /* 0x4000000000001df4 */

    /*01a8*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    __syncthreads()
    /*01b0*/         LDS R4, [R3+0x20];                              R4 = sdata[tid + 8]
    /*01b8*/         SSY 0x1e8;                                      
    /*01c0*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*01c8*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*01d0*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*01d8*/    @!P0 BRA 0x1c0;                                      /* 0x4003ffff800021e7 */
    /*01e0*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*01e8*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    __syncthreads()
    /*01f0*/         LDS R4, [R3+0x10];                              R4 = sdata[tid + 4]
    /*01f8*/         SSY 0x228;                                      
    /*0200*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0208*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0210*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0218*/    @!P0 BRA 0x200;                                      /* 0x4003ffff800021e7 */
    /*0220*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*0228*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    __syncthreads()
    /*0230*/         LDS R4, [R3+0x8];                               R4 = sdata[tid + 2]
    /*0238*/         SSY 0x268;                                      
    /*0240*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0248*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0250*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0258*/    @!P0 BRA 0x240;                                      /* 0x4003ffff800021e7 */
    /*0260*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*0268*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    __syncthreads()
    /*0270*/         LDS R4, [R3+0x4];                               R4 = sdata[tid + 1]
    /*0278*/         SSY 0x2a8;                                      
    /*0280*/         LDSLK P0, R5, [R3];                             R5 = sdata[tid] (load from shared memory and lock)
    /*0288*/     @P0 IADD R5, R5, R4;                                R5 = R5 + R4
    /*0290*/     @P0 STSUL [R3], R5;                                 sdata[tid] = R5 (store to shared memory and unlock)
    /*0298*/    @!P0 BRA 0x280;                                      /* 0x4003ffff800021e7 */
    /*02a0*/         NOP.S;                                          /* 0x4000000000001df4 */
    /*02a8*/         BAR.RED.POPC RZ, RZ, RZ, PT;                    /* 0x50ee0000ffffdc04 */
    /*02b0*/         ISETP.NE.AND.S P0, PT, R2, RZ, PT;              /* 0x1a8e0000fc21dc33 */
    /*02b8*/     @P0 BRA.U 0x2d8;                                    /* 0x40000000600081e7 */
    /*02c0*/    @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;               /* 0x4000400090002043 */
    /*02c8*/    @!P0 LDS R2, [RZ];                                   /* 0xc100000003f0a085 */
    /*02d0*/    @!P0 ST [R0], R2;                                    /* 0x900000000000a085 */
    /*02d8*/         EXIT;                                           /* 0x8000000000001de7 */

Returning to your real question: once you ensure that the kernels are correctly launched, you can easily verify that atomics do have an impact on performance.
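
As a rough illustration, a checked, event-timed launch could look like the sketch below; the grid/block configuration, buffer names, and the timeReduce2 helper are assumptions for illustration, not your actual host code.

#include <cstdio>
#include <cuda_runtime.h>

// Sketch: time one reduce2 launch with CUDA events and verify that it really ran.
// d_idata and d_odata are assumed to be device buffers allocated and filled elsewhere.
void timeReduce2(int *d_idata, int *d_odata, unsigned int n) {
    const unsigned int threads = 512;                // hypothetical block size
    const unsigned int blocks  = 64;                 // hypothetical grid size
    const size_t smemBytes = threads * sizeof(int);  // one int of shared memory per thread

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    reduce2<threads><<<blocks, threads, smemBytes>>>(d_idata, d_odata, n);
    cudaEventRecord(stop);

    // Without this check a failed launch returns silently and the timing is meaningless.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "reduce2 launch failed: %s\n", cudaGetErrorString(err));

    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("reduce2: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

If the launch check fires, the kernels never executed and the identical timings simply reflect the overhead of a launch that never happened.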
