In a previous post here, I asked how to calculate the sum of an array with a parallel reduction. Now I have a new problem: with a larger image, my result is not correct, and it changes every time I run the program. I tested with a sample array the size of a 96*96 image.
First time result: 28169.046875
Second time result: 28169.048828
Expected result: 28169.031250
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Sums blockDim.x consecutive floats per block and accumulates every block's
// total into *output.
//
// Launch layout: 1-D grid of 1-D blocks. Thread threadIdx.x of block
// blockIdx.x loads input[blockIdx.x * blockDim.x + threadIdx.x].
//
// Preconditions:
//   - blockDim.x <= 256 (capacity of the static shared scratch buffer).
//   - input holds at least gridDim.x * blockDim.x floats; the kernel performs
//     no bounds check, so the caller must pad or size the grid accordingly.
//   - *output is initialised (normally to 0) before the launch; the kernel
//     only adds to it.
//
// Parameters:
//   input  - device pointer to the values to sum.
//   output - device pointer to a single float accumulator.
//
// Note on reproducibility: each block adds its total with atomicAdd, and the
// order in which blocks reach that atomic is not fixed. Float addition is not
// associative, so the low-order digits of the result can differ from run to
// run and from a sequential CPU sum. Compare against a reference with a
// tolerance rather than expecting bit-identical output.
__global__ void calculate_threshold_kernel(float * input, float * output)
{
    // Widen before multiplying so large grids cannot overflow a 32-bit index.
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const int t = threadIdx.x;
    const int nThreads = (int)blockDim.x;

    __shared__ float partialSum[256];
    partialSum[t] = input[idx];
    __syncthreads();

    // Interleaved tree reduction: after the pass with a given stride, every
    // slot whose index is a multiple of 2*stride holds the sum of its group.
    for (int stride = 1; stride < nThreads; stride *= 2)
    {
        // The second condition keeps threads from folding in slots beyond the
        // block when blockDim.x is not a power of two (those slots were never
        // written and hold garbage).
        if (t % (2 * stride) == 0 && t + stride < nThreads)
            partialSum[t] += partialSum[t + stride];
        // Barrier sits outside the branch so every thread in the block reaches it.
        __syncthreads();
    }

    // One atomic per block publishes the block total.
    if (t == 0)
    {
        atomicAdd(output, partialSum[0]);
    }
}
// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Uploads a 96x96 sample image, sums it on the GPU with
// calculate_threshold_kernel and prints the total.
//
// The printed value is a float sum whose accumulation order across blocks is
// not fixed, so its last digits may differ between runs; compare it against a
// CPU reference with a tolerance.
int main( void )
{
    const int img_height = 96;
    const int img_width = 96;
    const int input_elements = img_height * img_width;

    float array[] = {[array sample]};

    // Refuse to read past the end of the sample data.
    const size_t sample_elements = sizeof(array) / sizeof(array[0]);
    if (sample_elements < (size_t)input_elements)
    {
        fprintf(stderr, "Sample array has %lu elements, expected at least %d\n",
                (unsigned long)sample_elements, input_elements);
        return EXIT_FAILURE;
    }

    // Round the grid up so no tail elements are dropped when the image size
    // is not a multiple of the block size.
    const dim3 blocksize(256);
    const dim3 gridsize((input_elements + blocksize.x - 1) / blocksize.x);

    // The kernel has no bounds check and reads gridsize.x * blocksize.x
    // floats, so the device buffer is padded to that size and zero-filled;
    // the padding contributes 0 to the sum.
    const size_t padded_elements = (size_t)gridsize.x * blocksize.x;
    const size_t padded_bytes = padded_elements * sizeof(float);
    const size_t input_bytes = (size_t)input_elements * sizeof(float);

    float *d_array = NULL;
    float *d_output = NULL;
    check_cuda(cudaMalloc((void**)&d_array, padded_bytes), "cudaMalloc(d_array)");
    check_cuda(cudaMalloc((void**)&d_output, sizeof(float)), "cudaMalloc(d_output)");

    // All-zero bytes are 0.0f, so cudaMemset is a valid way to zero floats.
    check_cuda(cudaMemset(d_array, 0, padded_bytes), "cudaMemset(d_array)");
    check_cuda(cudaMemset(d_output, 0, sizeof(float)), "cudaMemset(d_output)");
    check_cuda(cudaMemcpy(d_array, array, input_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(host to device)");

    calculate_threshold_kernel<<<gridsize,blocksize>>>(d_array, d_output);
    // Launch-configuration errors are only visible through cudaGetLastError.
    check_cuda(cudaGetLastError(), "calculate_threshold_kernel launch");

    // The blocking copy also waits for the kernel and surfaces execution errors.
    float h_output = 0.0f;
    check_cuda(cudaMemcpy(&h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost),
               "cudaMemcpy(device to host)");
    printf("Sum from GPU = %f\n", h_output);

    check_cuda(cudaFree(d_array), "cudaFree(d_array)");
    check_cuda(cudaFree(d_output), "cudaFree(d_output)");
    return 0;
}