I implemented the reduction#1 form the well-known slides by Mark Harris, but I obtain 0 as result. I filled the input array with the same values shown in the slides. I compiled with cuda 7.0 using the command nvcc reduction1.cu -o red1. Where is the mistake? Thanks.
#include <stdio.h>
#include <cuda_runtime.h>
#define THREADS_PER_BLOCK 16
__global__ void reduce1(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if (tid % (2*s) == 0) sdata[tid] += sdata[tid + s];
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main()
{
int inputLength=16;
int hostInput[16]={10,1,8,-1,0,-2,3,5,-2,-3,2,7,0,11,0,2};
int hostOutput=0;
int *deviceInput;
int *deviceOutput;
cudaMalloc((void **)&deviceInput, inputLength * sizeof(int));
cudaMalloc((void **)&deviceOutput, sizeof(int));
cudaMemcpy(deviceInput, hostInput, inputLength * sizeof(int),cudaMemcpyHostToDevice);
reduce1<<<1,THREADS_PER_BLOCK>>>(deviceInput, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n",hostOutput);
cudaFree(deviceInput);
cudaFree(deviceOutput);
return 0;
}