3

I was doing a cuda tutorial in which I have to make the dot product of two vectors. After implementing the solution provided in the tutorial I came across some issues that were solved in this stack overflow post. Now I am receiving the answer 0 regardless what I do. Bellow you can find the code!

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_atomic_functions.h"
#include <stdio.h>
#include <stdlib.h>
#define N (2048 * 8)
#define THREADS_PER_BLOCK 512

__global__ void dot(int *a, int *b, int *c)
{
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];

    __syncthreads();

    if (threadIdx.x == 0)
    {
        int sum = 0;
        for (int i = 0; i < N; i++)
        {
            sum += temp[i];
        }
        atomicAdd(c, sum);
    }
}

int main()
{
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;
    int size = N * sizeof(int);

   //allocate space for the variables on the device
    cudaMalloc((void **)&dev_a, size);
    cudaMalloc((void **)&dev_b, size);
    cudaMalloc((void **)&dev_c, sizeof(int));

   //allocate space for the variables on the host
   a = (int *)malloc(size);
   b = (int *)malloc(size);
   c = (int *)malloc(sizeof(int));

   //this is our ground truth
   int sumTest = 0;
   //generate numbers
   for (int i = 0; i < N; i++)
   {
       a[i] = rand() % 10;
       b[i] = rand() % 10;
       sumTest += a[i] * b[i];
       printf(" %d %d \n",a[i],b[i]);
   }

   *c = 0;

   cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
   cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
   cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);

   dot<<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >> >(dev_a, dev_b,    dev_c);

   cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);

   printf("%d ", *c);
   printf("%d ", sumTest);

   free(a);
   free(b);
   free(c);

   cudaFree(a);
   cudaFree(b);
   cudaFree(c);

   system("pause");

   return 0;

 }
Community
  • 1
  • 1
Mircea Paul Muresan
  • 628
  • 1
  • 9
  • 23

1 Answers1

3

First of all, please add CUDA error checking in the code as described in this legendary post.

Just before the kernel execution call, you are copying extra memory into dev_c in the following line:

cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);

It should be:

cudaMemcpy(dev_c, c, sizeof(int), cudaMemcpyHostToDevice);

Another error in the code is that inside the kernel, __shared__ memory variable temp is being accessed out of bounds in the for loop. Number of elements of the shared memory is equal to THREADS_PER_BLOCK while the loop is being iterated upto N. Just replace N with THREADS_PER_BLOCK in the loop.

Community
  • 1
  • 1
sgarizvi
  • 16,623
  • 9
  • 64
  • 98