After reading the manual of NVIDIA, I wrotea parrell reduction code as follows:
__global__ void kernel(int *devData)
{
__shared__ int sum;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (threadIdx.x == 0)
sum = 0;
__syncthreads();
sum += devData[i];
__syncthreads();
if (threadIdx.x == 0)
printf("sum of block %d is %d\n", blockIdx.x, sum);
}
int main(void)
{
// init device
int devIdx = 0;
cudaError_t err = cudaSuccess;
gpuDeviceInit(devIdx);
int i;
int data[100];
int *devData;
for (i = 0; i < 100; i++)
data[i] = 1;
err = cudaMalloc(&devData, 100 * sizeof(int));
checkCudaErrors(err);
// copy data to device
err = cudaMemcpy(devData, data, 100 * sizeof(int), cudaMemcpyHostToDevice);
checkCudaErrors(err);
int blocksPerGrid = 10;
int threadsPerBlock = 10;
// call kernel function
kernel <<<blocksPerGrid, threadsPerBlock>>> (devData);
checkCudaErrors(cudaGetLastError());
cudaDeviceReset();
return 0;
}
I'm trying to sum integers for each block and then print this sum. But I found the result was as follows:
sum of block 0 is 1
sum of block 6 is 1
sum of block 2 is 1
sum of block 8 is 1
sum of block 1 is 1
sum of block 7 is 1
sum of block 4 is 1
sum of block 3 is 1
sum of block 9 is 1
sum of block 5 is 1
The result I expected was 10.Is the __shared__ variable "sum" shared by every thread in a block? What's wrong with my understanding of "__shared__" variables in cuda?