I'm not sure I agree with everything that @HubertApplebaum said, but I can agree with the suggestion to use proper CUDA error checking. And as you mention in the code, warp-synchronous programming can be considered deprecated, but I cannot support the claim that it is broken (yet). However, I don't wish to argue about that; it's not central to your question here.
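For completeness, one common way to do that checking is with a small macro used after each CUDA runtime API call and after each kernel launch. This is just one illustrative pattern (the macro name is arbitrary), and it relies only on <stdio.h> and <stdlib.h>, which your code already includes:

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// usage, e.g. after the kernel launch in main():
min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
cudaCheckErrors("kernel launch failure");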
Another useful debugging suggestion: compile your code with -lineinfo and run it with cuda-memcheck.
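In my case that looks like this (t1074.cu is just my local file name for your code):

$ nvcc -lineinfo -o t1074 t1074.cu
$ cuda-memcheck ./t1074

If you did that, you would see many reports like this: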
========= Invalid __shared__ read of size 4
========= at 0x000001e0 in /home/bob/misc/t1074.cu:39:min_reduce(int*, int*, int)
========= by thread (64,0,0) in block (24,0,0)
========= Address 0x00000200 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
========= Host Frame:./t1074 [0x16dc1]
========= Host Frame:./t1074 [0x315d3]
========= Host Frame:./t1074 [0x28f5]
========= Host Frame:./t1074 [0x2623]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
========= Host Frame:./t1074 [0x271d]
which indicates both that a primary problem in your code is that you are incorrectly indexing into your __shared__ memory array, and the specific line of code where that is taking place. Neat! (It's line 39 in my case, but it will probably be a different line in your case.) If you then drill into that line, you will want to study this section of code:
#define NumThread 128
...
__shared__ int sdata[NumThread];
...
if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64]; //line 39 in my case
    __syncthreads();
}
You have defined NumThread as 128, and have statically allocated a shared memory array of that many int quantities. All well and good. What about the code in the if-statement? That if-condition will be satisfied, which means that all 128 threads in the block will execute the body of that if-statement. However, you are reading sdata[tid + 64] from shared memory, and for threads whose tid is greater than 63 (i.e. half of the threads in each block), this generates an index into shared memory greater than 127, which is out of bounds, i.e. illegal. For example, tid = 64 reads sdata[128], one element past the end of the array.
The fix (for this specific code that you have shown) is fairly simple: just add another if-test so that only the first 64 threads in each block perform the comparison. Note that __syncthreads() stays outside that test, so all threads in the block still reach it:

if (NumThread >= 128){
    if (tid < 64)
        if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
}
If you make that modification to your code and rerun the cuda-memcheck test, you'll see that all the runtime-reported errors are gone. Yay!
But the code still doesn't produce the right answer yet. You've made another error here:

for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];

As written, that comparison actually finds the maximum of the block results. If you want to find the minimum value, think about that logic carefully and you'll realize you should have done this:

for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
                               ^
                               |
                         greater than
With those two changes, your code produces the correct result for me:
$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#define NumThread 128
#define NumBlock 32

__global__ void min_reduce(int* In, int* Out, int n){
    __shared__ int sdata[NumThread];
    unsigned int i = blockIdx.x * NumThread + threadIdx.x;
    unsigned int tid = threadIdx.x;
    unsigned int gridSize = NumBlock * NumThread;
    int myMin = INT_MAX;
    while (i < n){
        if(In[i] < myMin)
            myMin = In[i];
        i += gridSize;
    }
    sdata[tid] = myMin;
    __syncthreads();
    if (NumThread >= 1024){
        if (tid < 512)
            if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
        __syncthreads();
    }
    if (NumThread >= 512){
        if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
        __syncthreads();
    }
    if (NumThread >= 256){
        if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
        __syncthreads();
    }
    if (NumThread >= 128){
        if (tid < 64)
            if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
        __syncthreads();
    }
    //the following practice is deprecated
    if (tid < 32){
        volatile int *smem = sdata;
        if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] = smem[tid+32];
        if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
        if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
        if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] = smem[tid+4];
        if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] = smem[tid+2];
        if (NumThread >= 2) if(smem[tid] > smem[tid + 1] ) smem[tid] = smem[tid+1];
    }
    if (tid == 0)
        if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
        else Out[blockIdx.x] = sdata[1];
}

int main(int argc, char* argv[]){
    unsigned int length = 1048576;
    int i, Size, min;
    int *a, *out, *gpuA, *gpuOut;
    cudaSetDevice(0);
    Size = length * sizeof(int);
    a = (int*)malloc(Size);
    out = (int*)malloc(NumBlock*sizeof(int));
    for(i=0;i<length;i++) a[i] = (i + 10);
    a[10]=5;
    cudaMalloc((void**)&gpuA,Size);
    cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
    cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
    min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
    cudaDeviceSynchronize();
    cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);
    min = out[0];
    for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
    printf("min = %d\n", min);
    return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$
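As an aside: since the volatile block in the kernel is the deprecated warp-synchronous practice mentioned earlier, you could instead finish the reduction with warp shuffle intrinsics (available since CUDA 9). The following is only a sketch of what the tail of the kernel could look like, replacing both the if (tid < 32) block and the final write-out; it is not part of the posted code:

if (tid < 32){
    int v = sdata[tid];
    if (NumThread >= 64) v = min(v, sdata[tid + 32]);
    // each iteration halves the number of live values within the warp;
    // the full mask is safe here because all 32 threads of warp 0 are active
    for (int offset = 16; offset > 0; offset >>= 1)
        v = min(v, __shfl_down_sync(0xFFFFFFFF, v, offset));
    if (tid == 0) Out[blockIdx.x] = v;
}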
Finally, note that you already have the if-check in the 1024-thread case; you may want to add an appropriate if-check to the 512- and 256-thread cases, just as I have added it for the 128-thread case above.
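If you do, a sketch following the same pattern would look like this (I have also dropped the sdata[tid + 128] != 0 test from your 256-thread case, since zero is a perfectly legal minimum and that test looks unintended for a min-reduction):

if (NumThread >= 512){
    if (tid < 256)
        if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
}
if (NumThread >= 256){
    if (tid < 128)
        if(sdata[tid] > sdata[tid + 128] ) sdata[tid] = sdata[tid + 128];
    __syncthreads();
}

With NumThread defined as 128 these branches are never taken anyway, so this only matters if you later raise NumThread.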