C/CUDA Nvidia Dotproduct example gives incorrect result

Question

I'm trying to implement the dot product in C/CUDA. I've mostly copied the code from Nvidia's tutorial, available here: http://www.nvidia.com/content/gtc-2010/pdfs/2131_gtc2010.pdf

The output I expect is

*c     = 44870400
result = 44870400

but I get

*c     = 44608256
result = 44870400

It seems that the last term, the "512*512 case" (the element at index 512), is not part of the calculated result: 44870400 - 44608256 = 262144 = 512*512. I've checked the code up and down and I can't find a synchronization bug. What am I doing wrong here?

The compile flags are:

# Build the cuda_dotp executable from cuda_dotp.cu with nvcc, targeting the sm_13 GPU architecture.
cuda_dotp: ./cuda_dotp.cu
    nvcc -arch=sm_13 \
    -o cuda_dotp ./cuda_dotp.cu

and the contents of file cuda_dotp.cu

#include <stdio.h>
#include <cuda.h>

#define N 513
#define THREADS_PER_BLOCK 512

// dot: each thread multiplies one pair a[index] * b[index] into shared memory,
// then thread 0 of the block adds up the products and stores the total in c[0].
//
// a, b : device arrays of ints to be multiplied element-wise.
// c    : device int; c[0] is overwritten with the block's sum.
//
// NOTE(review): this is the code the question is about; the defects below are
// left in place on purpose.
__global__ void dot(int *a, int *b, int *c) {
    // One product slot per thread of the block.
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    // NOTE(review): this load runs before the bounds check below, so threads
    // with index >= N still read a[index] and b[index].
    temp[threadIdx.x] = a[index] * b[index];
    // NOTE(review): threads with index >= N leave here and never reach the
    // __syncthreads() barrier that the remaining threads of the block wait on.
    if (index >= N) return;

    __syncthreads();
    if(0 == threadIdx.x) {
        int sum = 0;
        // Number of slots to add up: min(THREADS_PER_BLOCK, N).
        int max = THREADS_PER_BLOCK;
        if (N < max) max = N;

        for (int i = 0; i < max; i++) {
            sum += temp[i];
        }
        // Plain store: every block writes the same location, so only one
        // block's sum survives when more than one block is launched.
        c[0] = sum;
    }
}

// Fills a[0] .. a[size-1] with the values 0, 1, ..., size-1.
// Despite the name the sequence is deterministic. Nothing is written when
// size <= 0.
void random_ints(int *a, int size)
{
    int value = 0;
    int *slot = a;

    while (value < size) {
        *slot++ = value++;
    }
}

// Host driver: fills two N-element arrays, computes their dot product on the
// CPU (result) and on the GPU (*c), and prints both values for comparison.
// Always returns 0.
//
// NOTE(review): none of the cudaMalloc / cudaMemcpy return codes, the malloc
// results, or the kernel launch status are checked.
int main(void) {
    int i;
    int result;
    int *a, *b, *c; // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
    int size = N * sizeof(int); // we need space for N ints
    // allocate device copies of a, b, c
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );
    cudaMalloc( (void**)&dev_c, sizeof(int) );
    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( sizeof(int) );

    // Both inputs are filled with the same contents by random_ints().
    random_ints( a, N );
    random_ints( b, N );
    /*
    printf("a = ");
    for (i=0; i<N; i++) printf("%d, ", a[i]);
    printf("\n");
    printf("b = ");
    for (i=0; i<N; i++) printf("%d, ", b[i]);
    printf("\n");
    */
    // CPU reference value that the GPU result is compared against.
    result = 0;
    for (i=0; i<N; i++) result += a[i] * b[i];
    // Clears only the host copy; nothing is ever copied into dev_c.
    *c = 0;

    // copy inputs to device
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);

    // NOTE(review): integer division rounds down. With N = 513 and
    // THREADS_PER_BLOCK = 512 this yields 1 block, i.e. 512 threads, so no
    // thread is assigned index 512.
    int blocks = N/THREADS_PER_BLOCK;
    if(blocks<1) blocks=1;

    // launch dot() kernel
    dot <<< blocks, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);

    // copy device result back to host copy of c
    cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);

    printf("*c     = %d\n", *c);
    printf("result = %d\n", result);

    free(a); free(b); free(c);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}

The worst of the problems in this code would have been completely obvious if you had added correct API status checking (about 60 seconds effort) — talonmies, May 19 '15 at 04:58

score 2 · Answer 1 · answered May 18 '15 at 23:53

There were quite a few errors: accessing arrays outside of the allocated space, launching 0 blocks (integer division: (int)10/(int)512 = 0), and not initializing c before adding to it in the kernel.

Compare your code with the following.

#include <stdio.h>
#include <cuda.h>

#define N 10
#define THREADS_PER_BLOCK 512

// dot: single-block dot product of the first N elements of a and b.
//
// Launch layout: 1-D blocks with blockDim.x <= THREADS_PER_BLOCK. The result
// is only meaningful for a one-block launch (N <= blockDim.x), because every
// block overwrites c[0] with its own partial sum.
// Shared memory: THREADS_PER_BLOCK ints, statically allocated.
//
// a, b : device arrays holding at least N ints each.
// c    : device int; c[0] receives the sum of a[i] * b[i].
__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Check the bound BEFORE touching a/b, so threads past the end of the data
    // never read outside the arrays. They store 0 instead of returning early:
    // every thread of the block must reach __syncthreads(), and every slot
    // that thread 0 sums below must have been written.
    temp[threadIdx.x] = (index < N) ? a[index] * b[index] : 0;

    __syncthreads();
    if(0 == threadIdx.x) {
        int sum = 0;

        // Out-of-range slots hold 0, so summing the whole block is safe.
        for (unsigned int i = 0; i < blockDim.x; i++){
            sum += temp[i];
        }
        c[0]=sum;
    }
}

// Stores the index of every element into the element itself:
// a[k] = k for 0 <= k < size. The contents are not random, despite the name.
// A size of zero or less leaves the array untouched.
void random_ints(int *a, int size)
{
    // Walk from the back; the final contents do not depend on the fill order.
    for (int k = size; k-- > 0; ) {
        a[k] = k;
    }
}

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns nonzero on failure, 0 on success.
static int cuda_failed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Host driver: fills two N-element arrays, prints them, computes their dot
// product on the GPU and prints it.
// Returns 0 on success, 1 if any allocation, copy or the kernel launch fails.
int main(void) {
    int i;
    int status = 1;                                   // exit code; cleared on success
    int *a = NULL, *b = NULL, *c = NULL;              // host copies of a, b, c
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;  // device copies of a, b, c
    int size = N * sizeof(int);                       // we need space for N ints
    // Integer division rounds down (10/512 == 0), so launch at least 1 block.
    int blocks = N/THREADS_PER_BLOCK;
    if(blocks<1) blocks=1;

    // allocate device copies of a, b, c
    if (cuda_failed(cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)")) goto cleanup;
    if (cuda_failed(cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)")) goto cleanup;
    if (cuda_failed(cudaMalloc( (void**)&dev_c, sizeof(int) ), "cudaMalloc(dev_c)")) goto cleanup;

    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( sizeof(int) );
    if (!a || !b || !c) {
        fprintf(stderr, "host allocation failed\n");
        goto cleanup;
    }

    random_ints( a, N );
    random_ints( b, N );
    printf("a = ");
    for (i=0; i<N; i++) printf("%d, ", a[i]);
    printf("\n");
    printf("b = ");
    for (i=0; i<N; i++) printf("%d, ", b[i]);
    printf("\n");
    // Gives the host copy a defined value; the kernel overwrites c[0] on the
    // device, so dev_c itself needs no initialisation here.
    *c = 0;

    // copy inputs to device
    if (cuda_failed(cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")) goto cleanup;
    if (cuda_failed(cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)")) goto cleanup;

    // launch dot() kernel; a launch reports its errors through cudaGetLastError()
    dot<<< blocks,THREADS_PER_BLOCK>>>( dev_a, dev_b, dev_c);
    if (cuda_failed(cudaGetLastError(), "dot kernel launch")) goto cleanup;

    // copy device result back to host copy of c; this blocking copy also
    // reports errors raised while the kernel was executing
    if (cuda_failed(cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost), "cudaMemcpy(c)")) goto cleanup;

    printf("*c = %d\n", *c);
    status = 0;

cleanup:
    // Pointers that were never allocated are still NULL, which free() and
    // cudaFree() both accept.
    free(a); free(b); free(c);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return status;
}

This solved it for `N <= 512`. However, when `N > THREADS_PER_BLOCK` it fails to produce the correct result. I think this has to do with `blocks`, but when I tried adding 1 to `blocks` so that there are enough blocks and threads to cover the whole array, it still didn't work. I'll leave the updated code in an edited version of the question. — neckutrek, May 19 '15 at 09:39
Currently your code works for a single block. You need to generalize it to multiple blocks again. This includes (1) fixing the number of blocks, as you did, (2) doing proper array bounds checking (fix your out-of-bounds accesses when N>512 and blocks > 1, particularly with your 0 thread), and (3) going back to using atomic add (remember, junk + correct answer still equals junk). I have confidence you can take it from here. — Christian Sarofeen, May 19 '15 at 12:07
Thanks for your help, I managed to solve it. I'll post the solution as an answer to the question for later reference. — neckutrek, May 19 '15 at 15:01

score 2 · Answer 2 · answered May 19 '15 at 15:02

Finally, this solved the problem. The changes are described in the comments on the answer above.

#include <stdio.h>
#include <cuda.h>

#define N 4096
#define THREADS_PER_BLOCK 512

// dot: adds the dot product of the first N elements of a and b to *c.
//
// Launch layout: 1-D grid of 1-D blocks with blockDim.x <= THREADS_PER_BLOCK;
// the grid must provide at least N threads in total.
// Shared memory: THREADS_PER_BLOCK ints, statically allocated.
//
// a, b : device arrays holding at least N ints each.
// c    : device int accumulator. It must be set to 0 before the launch,
//        because each block ADDS its partial sum with atomicAdd.
__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads past the end of the data store 0 instead of returning early:
    // every thread of the block must reach __syncthreads(), and every slot
    // that thread 0 sums below must have been written. Otherwise a partially
    // filled last block adds uninitialised shared memory to the result.
    temp[threadIdx.x] = (index < N) ? a[index] * b[index] : 0;

    __syncthreads();
    if (0 == threadIdx.x) {
        int sum = 0;

        // Out-of-range slots hold 0, so summing the whole block is safe.
        for (unsigned int i = 0; i < blockDim.x; i++) {
            sum += temp[i];
        }
        // One atomic per block combines the partial sums of all blocks.
        atomicAdd(c, sum);
    }
}

// Initialises the first `size` entries of `a` to the ramp 0, 1, 2, ...
// (entry k receives k). The values are deterministic, not random. Nothing is
// written when size <= 0.
void random_ints(int *a, int size)
{
    int k = 0;

    while (k < size) {
        a[k] = k;
        ++k;
    }
}

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns nonzero on failure, 0 on success.
static int cuda_failed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Host driver: fills two N-element arrays, computes their dot product on the
// CPU (result) and on the GPU (*c), and prints both values for comparison.
// Returns 0 on success, 1 if any allocation, copy or the kernel launch fails.
//
// NOTE(review): with N = 4096 and a[i] = b[i] = i the exact dot product is
// about 2.29e10, which does not fit in an int, so both accumulators overflow.
// Widen the types if the exact value matters.
int main(void) {
    int i;
    int result;
    int status = 1;                                   // exit code; cleared on success
    int *a = NULL, *b = NULL, *c = NULL;              // host copies of a, b, c
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;  // device copies of a, b, c
    int size = N * sizeof(int);                       // we need space for N ints
    // Ceiling division: enough blocks to cover all N elements, without the
    // spare empty block that N/THREADS_PER_BLOCK + 1 launches when N is an
    // exact multiple of the block size.
    int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    // allocate device copies of a, b, c
    if (cuda_failed(cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)")) goto cleanup;
    if (cuda_failed(cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)")) goto cleanup;
    if (cuda_failed(cudaMalloc( (void**)&dev_c, sizeof(int) ), "cudaMalloc(dev_c)")) goto cleanup;

    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( sizeof(int) );
    if (!a || !b || !c) {
        fprintf(stderr, "host allocation failed\n");
        goto cleanup;
    }

    random_ints( a, N );
    random_ints( b, N );
    /*
    printf("a = ");
    for (i=0; i<N; i++) printf("%d, ", a[i]);
    printf("\n");
    printf("b = ");
    for (i=0; i<N; i++) printf("%d, ", b[i]);
    printf("\n");
    */
    // CPU reference value that the GPU result is compared against.
    result = 0;
    for (i=0; i<N; i++) result += a[i] * b[i];
    *c = 0;

    // copy inputs to device
    if (cuda_failed(cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")) goto cleanup;
    if (cuda_failed(cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)")) goto cleanup;
    // The kernel accumulates into dev_c with atomicAdd, so the zero has to be
    // on the device too; device memory returned by cudaMalloc is not cleared.
    if (cuda_failed(cudaMemcpy( dev_c, c, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_c)")) goto cleanup;

    // launch dot() kernel; a launch reports its errors through cudaGetLastError()
    dot <<< blocks, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
    if (cuda_failed(cudaGetLastError(), "dot kernel launch")) goto cleanup;

    // copy device result back to host copy of c; this blocking copy also
    // reports errors raised while the kernel was executing
    if (cuda_failed(cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost), "cudaMemcpy(c)")) goto cleanup;

    printf("*c     = %d\n", *c);
    printf("result = %d\n", result);
    status = 0;

cleanup:
    // Pointers that were never allocated are still NULL, which free() and
    // cudaFree() both accept.
    free(a); free(b); free(c);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return status;
}

C/CUDA Nvidia Dotproduct example gives incorrect result

2 Answers