
I'm a newbie in CUDA C... I want to sum the elements of an array (with a reduction) in 1 block with 267 threads, using shared memory. I have read the book "CUDA by Example: An Introduction to General-Purpose GPU Programming", and following some of its recommendations I wrote my version of the program:

__global__ void
conva(int* a, int* out) 
{
    __shared__ int cache[534];
    int cacheIndex = threadIdx.x;
    for(int n=0; n<2;++n) {
        cache[cacheIndex+n] = a[cacheIndex+n];
        int i = blockDim.x/2;
        while (i != 0) {
            if (cacheIndex < i)
                cache[cacheIndex + n] += cache[cacheIndex + n + i];
            __syncthreads();
            i /= 2;
        } 
    }
    //I don't know whether this __syncthreads() is needed or not
    __syncthreads();
     if (cacheIndex == 0)
         out = &cache[0];
}

int main(int argc, char** argv)
{
    //the input array to sum
    int convolution[534];
    for(int i=0; i<534; ++i) 
        convolution[i] = 1;
    //variable that receives the sum from the device
    int summa = 0;
    //device pointer for the result
    int* tash;

    int* convolution_gpu;
    cudaMalloc((void**)(&convolution_gpu), 534*sizeof(int));
    cudaMalloc((void**)(&tash), sizeof(int));

    cudaMemcpy(convolution_gpu, convolution, 534*sizeof(int),  cudaMemcpyHostToDevice );
    //launch the kernel with 1 block and 267 threads
    conva<<<1, 267>>>(convolution_gpu, tash);

    cudaMemcpy(&summa, tash, sizeof(int), cudaMemcpyDeviceToHost);
    //here I expect 534, but I get garbage (maybe)
    std::cout<<summa<<std::endl;

    cudaFree(convolution_gpu);
    cudaFree(tash);
    getchar();
}

Please tell me where the error is and help me fix it... (sorry for my English)


1 Answer

In your kernel, this:

 if (cacheIndex == 0)
     out = &cache[0];

is almost certainly wrong. Surely you want something like:

 if (cacheIndex == 0)
     *out = cache[0];
talonmies
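
As an aside (my own illustration, not part of the answer above): kernel parameters are passed by value, so assigning to `out` inside the kernel only changes the kernel's local copy, and the host never sees it; writing `*out = ...` stores the value into the device memory the host allocated, which `cudaMemcpy` can then read back. A minimal sketch of the difference, with hypothetical kernel names:

// Sketch: a kernel parameter is passed by value, so reassigning `out`
// inside the kernel changes only the kernel's local copy, while
// `*out = value` actually stores into the device memory the host allocated.
#include <cstdio>

__global__ void store_through_pointer(int* out)
{
    *out = 534;        // visible to the host after the kernel finishes
}

__global__ void reassign_pointer(int* out)
{
    int local = 534;
    out = &local;      // no effect outside the kernel: `out` is a copy
}

int main()
{
    int* d_result;
    cudaMalloc(&d_result, sizeof(int));
    cudaMemset(d_result, 0, sizeof(int));

    int host_value = -1;

    store_through_pointer<<<1, 1>>>(d_result);
    cudaMemcpy(&host_value, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("store_through_pointer: %d\n", host_value);   // prints 534

    cudaMemset(d_result, 0, sizeof(int));
    reassign_pointer<<<1, 1>>>(d_result);
    cudaMemcpy(&host_value, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("reassign_pointer:      %d\n", host_value);   // still 0

    cudaFree(d_result);
    return 0;
}
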
    There are several other problems with your reduction code as well. (For example, you are launching an odd number of threads, and your reduction method cannot handle an odd threadblock size). The method doesn't make sense to me and I don't think it will be particularly efficient, so instead I suggest you study [this material](https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf) on reduction, to learn how to do a fast/efficient one. – Robert Crovella Feb 06 '16 at 15:57
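
Following up on that comment, here is a sketch of a conventional single-block shared-memory reduction in the spirit of the linked NVIDIA material. It is my own illustration rather than code from the question or answer; the block size `BLOCK`, the kernel name `reduce_sum`, and the strided-load step are my choices. Each thread first accumulates a strided partial sum, so an arbitrary element count such as 534 is handled, and the block then does a power-of-two tree reduction with a `__syncthreads()` after every step:

// Sketch of a single-block shared-memory reduction.
// Assumptions: BLOCK is a power of two and the kernel is launched with
// exactly one block of BLOCK threads.
#include <cstdio>

const int BLOCK = 256;   // power of two, chosen for this sketch

__global__ void reduce_sum(const int* in, int* out, int n)
{
    __shared__ int cache[BLOCK];
    int tid = threadIdx.x;

    // Each thread sums elements tid, tid+BLOCK, tid+2*BLOCK, ...
    int partial = 0;
    for (int i = tid; i < n; i += BLOCK)
        partial += in[i];
    cache[tid] = partial;
    __syncthreads();

    // Tree reduction: halve the number of active threads each step.
    for (int stride = BLOCK / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            cache[tid] += cache[tid + stride];
        __syncthreads();   // outside the if, so every thread reaches it
    }

    if (tid == 0)
        *out = cache[0];
}

int main()
{
    const int n = 534;
    int h_in[n];
    for (int i = 0; i < n; ++i) h_in[i] = 1;

    int *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

    reduce_sum<<<1, BLOCK>>>(d_in, d_out, n);

    int sum = 0;
    cudaMemcpy(&sum, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d\n", sum);   // expect 534

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}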