I'm not sure I agree with everything that @HubertApplebaum said, but I can agree with the suggestion to use proper CUDA error checking. And as you mention in the code, warp-synchronous programming can be considered deprecated, but I cannot support the claim that it is broken (yet). However, I don't wish to argue about that; it's not central to your question here.
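For completeness, one common way to do that checking is with a small macro used after each CUDA runtime API call and after each kernel launch. This is just one illustrative pattern (the macro name is arbitrary), and it relies only on <stdio.h> and <stdlib.h>, which your code already includes:

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// usage, e.g. after the kernel launch in main():
min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
cudaCheckErrors("kernel launch failure");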
Another useful debugging suggestion: compile your code with -lineinfo and run it with cuda-memcheck.
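In my case that looks like this (t1074.cu is just my local file name for your code):

$ nvcc -lineinfo -o t1074 t1074.cu
$ cuda-memcheck ./t1074

If you did that, you would see many reports like this: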
========= Invalid __shared__ read of size 4
========= at 0x000001e0 in /home/bob/misc/t1074.cu:39:min_reduce(int*, int*, int)
========= by thread (64,0,0) in block (24,0,0)
========= Address 0x00000200 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
========= Host Frame:./t1074 [0x16dc1]
========= Host Frame:./t1074 [0x315d3]
========= Host Frame:./t1074 [0x28f5]
========= Host Frame:./t1074 [0x2623]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
========= Host Frame:./t1074 [0x271d]
which indicates both that a primary problem in your code is that you are incorrectly indexing into your __shared__ memory array, and the specific line of code where that is taking place. Neat! (It's line 39 in my case, but it will probably be a different line in your case.) If you then drill into that line, you will want to study this section of code:
#define NumThread 128
...
__shared__ int sdata[NumThread];
...
if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64]; //line 39 in my case
    __syncthreads();
}
You have defined NumThread as 128, and have statically allocated a shared memory array of that many int quantities. All well and good. What about the code in the if-statement? That if-condition will be satisfied, which means that all 128 threads in the block will execute the body of that if-statement. However, you are reading sdata[tid + 64] from shared memory, and for threads whose tid is greater than 63 (i.e. half of the threads in each block), this generates an index into shared memory greater than 127, which is out of bounds, i.e. illegal. For example, tid = 64 reads sdata[128], one element past the end of the array.
The fix (for this specific code that you have shown) is fairly simple: just add another if-test so that only the first 64 threads in each block perform the comparison. Note that __syncthreads() stays outside that test, so all threads in the block still reach it:

if (NumThread >= 128){
    if (tid < 64)
        if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
}
If you make that modification to your code and rerun the cuda-memcheck test, you'll see that all the runtime-reported errors are gone. Yay!
But the code still doesn't produce the right answer yet. You've made another error here:

for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];

As written, that comparison actually finds the maximum of the block results. If you want to find the minimum value, think about that logic carefully and you'll realize you should have done this:

for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
                               ^
                               |
                         greater than
With those two changes, your code produces the correct result for me:
$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#define NumThread 128
#define NumBlock 32

__global__ void min_reduce(int* In, int* Out, int n){
    __shared__ int sdata[NumThread];
    unsigned int i = blockIdx.x * NumThread + threadIdx.x;
    unsigned int tid = threadIdx.x;
    unsigned int gridSize = NumBlock * NumThread;
    int myMin = INT_MAX;
    while (i < n){
        if(In[i] < myMin)
            myMin = In[i];
        i += gridSize;
    }
    sdata[tid] = myMin;
    __syncthreads();
    if (NumThread >= 1024){
        if (tid < 512)
            if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
        __syncthreads();
    }
    if (NumThread >= 512){
        if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
        __syncthreads();
    }
    if (NumThread >= 256){
        if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
        __syncthreads();
    }
    if (NumThread >= 128){
        if (tid < 64)
            if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
        __syncthreads();
    }
    //the following practice is deprecated
    if (tid < 32){
        volatile int *smem = sdata;
        if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] = smem[tid+32];
        if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
        if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
        if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] = smem[tid+4];
        if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] = smem[tid+2];
        if (NumThread >= 2) if(smem[tid] > smem[tid + 1] ) smem[tid] = smem[tid+1];
    }
    if (tid == 0)
        if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
        else Out[blockIdx.x] = sdata[1];
}

int main(int argc, char* argv[]){
    unsigned int length = 1048576;
    int i, Size, min;
    int *a, *out, *gpuA, *gpuOut;
    cudaSetDevice(0);
    Size = length * sizeof(int);
    a = (int*)malloc(Size);
    out = (int*)malloc(NumBlock*sizeof(int));
    for(i=0;i<length;i++) a[i] = (i + 10);
    a[10]=5;
    cudaMalloc((void**)&gpuA,Size);
    cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
    cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
    min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
    cudaDeviceSynchronize();
    cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);
    min = out[0];
    for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
    printf("min = %d\n", min);
    return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$
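As an aside: since the volatile block in the kernel is the deprecated warp-synchronous practice mentioned earlier, you could instead finish the reduction with warp shuffle intrinsics (available since CUDA 9). The following is only a sketch of what the tail of the kernel could look like, replacing both the if (tid < 32) block and the final write-out; it is not part of the posted code:

if (tid < 32){
    int v = sdata[tid];
    if (NumThread >= 64) v = min(v, sdata[tid + 32]);
    // each iteration halves the number of live values within the warp;
    // the full mask is safe here because all 32 threads of warp 0 are active
    for (int offset = 16; offset > 0; offset >>= 1)
        v = min(v, __shfl_down_sync(0xFFFFFFFF, v, offset));
    if (tid == 0) Out[blockIdx.x] = v;
}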
Finally, note that you already have the if-check in the 1024-thread case; you may want to add an appropriate if-check to the 512- and 256-thread cases, just as I have added it for the 128-thread case above.
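If you do, a sketch following the same pattern would look like this (I have also dropped the sdata[tid + 128] != 0 test from your 256-thread case, since zero is a perfectly legal minimum and that test looks unintended for a min-reduction):

if (NumThread >= 512){
    if (tid < 256)
        if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
}
if (NumThread >= 256){
    if (tid < 128)
        if(sdata[tid] > sdata[tid + 128] ) sdata[tid] = sdata[tid + 128];
    __syncthreads();
}

With NumThread defined as 128 these branches are never taken anyway, so this only matters if you later raise NumThread.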