0
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#define NUMBEROFMX 256*64

__global__ void reduce0(int *g_idata, int *g_odata)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();


    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        if (tid % (2 * s) == 0)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0)
    {
        g_odata[blockIdx.x] = sdata[0];
    }
}

int main()
{
    int *A;
    int *B;
    int *dev_A;
    int *dev_B;


    A = (int*)malloc(sizeof(int) * NUMBEROFMX);
    B = (int*)malloc(sizeof(int) * NUMBEROFMX);

    cudaMalloc((void**)&dev_A, sizeof(int)*NUMBEROFMX);
    cudaMalloc((void**)&dev_B, sizeof(int)*NUMBEROFMX);

    for (int i = 0; i < NUMBEROFMX; i++)
    {
        A[i] = 1;
    }

    cudaMemcpy(dev_A, A, sizeof(int)*NUMBEROFMX, cudaMemcpyHostToDevice);
    reduce0 << <256, 64 >> >(dev_A, dev_B);
    cudaMemcpy(B, dev_B, sizeof(int)*NUMBEROFMX, cudaMemcpyDeviceToHost);
    printf("%d\n", B[0]);
}

I study about CUDA programming. This code is not completed but I want to make a code for sum of arrays with tree reduction. I expect to get 64 in B[0] but there the value of B[0] is not valid. When I use NSIGHT to debug this code, g_odata[0] is 64. However B[0] is not valid. I don't know why.

Jeagun
  • 1
  • 1

1 Answers1

2

If you would have used proper CUDA error checking or run your code with cuda-memcheck you would have found an illegal use of shared memory (Out-of-range Shared or Local Address).

For Dynamic Shared Memory, which you are trying to use, you need to specify the size of shared memory per block on kernel invocation:

reduce0<<<256, 64, 64*sizeof(int)>>>(dev_A, dev_B);

With this change your kernel works as expected.

For more background on shared memory see the blog post Using Shared Memory in CUDA C/C++.

Community
  • 1
  • 1
havogt
  • 2,572
  • 1
  • 27
  • 37