#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define NUMBEROFMX 256*64
__global__ void reduce0(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
if (tid % (2 * s) == 0)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0)
{
g_odata[blockIdx.x] = sdata[0];
}
}
int main()
{
int *A;
int *B;
int *dev_A;
int *dev_B;
A = (int*)malloc(sizeof(int) * NUMBEROFMX);
B = (int*)malloc(sizeof(int) * NUMBEROFMX);
cudaMalloc((void**)&dev_A, sizeof(int)*NUMBEROFMX);
cudaMalloc((void**)&dev_B, sizeof(int)*NUMBEROFMX);
for (int i = 0; i < NUMBEROFMX; i++)
{
A[i] = 1;
}
cudaMemcpy(dev_A, A, sizeof(int)*NUMBEROFMX, cudaMemcpyHostToDevice);
reduce0 << <256, 64 >> >(dev_A, dev_B);
cudaMemcpy(B, dev_B, sizeof(int)*NUMBEROFMX, cudaMemcpyDeviceToHost);
printf("%d\n", B[0]);
}
I study about CUDA programming. This code is not completed but I want to make a code for sum of arrays with tree reduction. I expect to get 64 in B[0] but there the value of B[0] is not valid. When I use NSIGHT to debug this code, g_odata[0] is 64. However B[0] is not valid. I don't know why.