In the below code, I am trying to implement a simple parallel reduction with blocksize and number of threads per block being 1024. However, after implementing partial reduction, I wish to see whether my implementation is going right or not and in that process I make the program print the first element of the host memory (after data has been copied from device memory to host memory). My host memory is initialize with '1' and is copied to device memory for reduction. And the printf statement after the reduction process still gives me '1' at the first element of the array.
Is there a problem in what I am getting to print or is it something logical in the implementation of reduction? In addition printf statements in the kernel do not print anything. Is there something wrong in my syntax or the call to the printf statement? My code is as below:
ifndef CUDACC
define CUDACC
endif
include "cuda_runtime.h"
include "device_launch_parameters.h"
include
include
ifndef THREADSPERBLOCK
define THREADSPERBLOCK 1024
endif
ifndef NUMBLOCKS
define NUMBLOCKS 1024
endif
global void reduceKernel(int *c)
{
extern shared int sh_arr[];
int index = blockDim.x*blockIdx.x + threadIdx.x;
int sh_index = threadIdx.x;
// Storing data from Global memory to shared Memory
sh_arr[sh_index] = c[index];
__syncthreads();
for(unsigned int i = blockDim.x/2; i>0 ; i>>=1)
{
if(sh_index < i){
sh_arr[sh_index] += sh_arr[i+sh_index];
}
__syncthreads();
}
if(sh_index ==0)
c[blockIdx.x]=sh_arr[sh_index];
printf("value stored at %d is %d \n", blockIdx.x, c[blockIdx.x]);
return;
}
int main()
{
int *h_a;
int *d_a;
int share_memSize, h_memSize;
size_t d_memSize;
share_memSize = THREADSPERBLOCK*sizeof(int);
h_memSize = THREADSPERBLOCK*NUMBLOCKS;
h_a = (int*)malloc(sizeof(int)*h_memSize);
d_memSize=THREADSPERBLOCK*NUMBLOCKS;
cudaMalloc( (void**)&d_a, h_memSize*sizeof(int));
for(int i=0; i<h_memSize; i++)
{
h_a[i]=1;
};
//printf("last element of array %d \n", h_a[h_memSize-1]);
cudaMemcpy((void**)&d_a, (void**)&h_a, h_memSize, cudaMemcpyHostToDevice);
reduceKernel<<<NUMBLOCKS, THREADSPERBLOCK, share_memSize>>>(d_a);
cudaMemcpy((void**)&h_a, (void**)&d_a, d_memSize, cudaMemcpyDeviceToHost);
printf("sizeof host memory %d \n", d_memSize); //sizeof(h_a));
printf("sum after reduction %d \n", h_a[0]);
}