
I am running vector addition code written in CUDA. The code itself is fine, but a problem appears when I increase the vector size: the number of errors (elements where the CPU and GPU results differ) becomes very large. I am attaching the code below:

#include <stdio.h>
#include <stdlib.h>

#include "cuda_utils.h"

#include "timer.h"

/*
 * **CUDA KERNEL** 
 * 
 * Compute the sum of two vectors 
 *   C[i] = A[i] + B[i]
 * 
 */
__global__ void vecAdd(float* a, float* b, float* c) {

  /* Calculate index for this thread */
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  /* Compute the element of C */
  c[i] = a[i] + b[i];
}

void compute_vec_add(int N, float *a, float* b, float *c);

/*
 * 
 * Host code to drive the CUDA Kernel
 * 
 */
int main() {

  float *d_a, *d_b, *d_c;
  float *h_a, *h_b, *h_c, *h_temp;
  int i;
  int N = 1024 * 1024 * 512;

  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init();
  timer = stopwatch_create();

  /*
   Create the vectors
   */
  h_a = (float *) malloc(sizeof(float) * N);
  h_b = (float *) malloc(sizeof(float) * N);
  h_c = (float *) malloc(sizeof(float) * N);

  /*
   Set the initial values of h_a, h_b, and h_c
   */
  for (i = 0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = 0.0;
  }

  /*
   Allocate space on the GPU
   */
  CUDA_CHECK_ERROR(cudaMalloc(&d_a, sizeof(float) * N));
  CUDA_CHECK_ERROR(cudaMalloc(&d_b, sizeof(float) * N));
  CUDA_CHECK_ERROR(cudaMalloc(&d_c, sizeof(float) * N));

  /*
   Copy d_a and d_b from CPU to GPU
   */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(
      cudaMemcpy(d_a, h_a, sizeof(float) * N, cudaMemcpyHostToDevice));
  CUDA_CHECK_ERROR(
      cudaMemcpy(d_b, h_b, sizeof(float) * N, cudaMemcpyHostToDevice));
  t_pcie_htd = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",
          t_pcie_htd);

  /*
   Run N/256 blocks of 256 threads each
   */
  dim3 GS(N / 256, 1, 1);
  dim3 BS(256, 1, 1);

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  cudaThreadSynchronize();
  t_kernel = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);

  /*
   Copy d_c from GPU to CPU
   */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(
      cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
  t_pcie_dth = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",
          t_pcie_dth);

  /* 
   Double check errors
   */
  h_temp = (float *) malloc(sizeof(float) * N);
  stopwatch_start(timer);
  compute_vec_add(N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);

  int cnt = 0;
  for (int i = 0; i < N; i++) {
    if (abs(h_temp[i] - h_c[i]) > 1e-5)
      cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /*
   Free the device memory
   */
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  /*
   Free the host memory
   */
  free(h_a);
  free(h_b);
  free(h_c);

  /* 
   Free timer 
   */
  stopwatch_destroy(timer);

  if (cnt == 0) {
    printf("\n\nSuccess\n");
  }
}

void compute_vec_add(int N, float *a, float* b, float *c) {
  int i;
  for (i = 0; i < N; i++)
    c[i] = a[i] + b[i];
}

Edit: This is how I am compiling

nvcc vecAdd.cu timer.o

The output of the above code when run on a GTX TITAN X is the following:

Timer: gettimeofday
Timer resolution: ~ 1 us (?)
Time to transfer data from host to device: 1.44104 secs
Time to execute GPU kernel: 0.000121 secs
Time to transfer data from device to host: 0.725893 secs
Time to execute CPU program: 2.96071 secs
number of errors: 350576933 out of 536870912

Also, why does it take 0.72 seconds to transfer around 2 GB of data from device to host, or 1.44 seconds to transfer ~4 GB of data from host to device, in spite of the high-bandwidth connection between CPU and GPU? Thank you.

amritkrs
  • You have incomplete error checking around the kernel launch, which means that you will not be detecting the (almost certain) kernel launch failure at large data sizes. Also, `cudaThreadSynchronize` is long deprecated and you should use `cudaDeviceSynchronize` instead. Please edit your question to include the compile statements used to compile this code. – talonmies Jul 08 '16 at 07:50
  • @talonmies I have added the compile statements. Also, I tried cudaDeviceSynchronize but it didn't help. I didn't know about error checking when launching a kernel; I will look it up. Thanks – amritkrs Jul 08 '16 at 15:09
  • Your compile command compiles for compute capability 2.0 (the default, since you haven't specified any arch switches). Compute capability 2.0 code is limited to 65535 blocks in the grid X dimension. Your code exceeds that here: `dim3 GS(N / 256, 1, 1);`, so your kernel is not launching. Since you have incomplete error checking, you don't get any indication of this. Use [proper cuda error checking](http://stackoverflow.com/questions/14038589). You may be able to fix this by compiling with `-arch=sm_52` to match your Titan X GPU. – Robert Crovella Jul 08 '16 at 15:13
  • Also, `N*sizeof(float)` exceeds what can be stored in an `int` variable, so I would suggest that you change `int N = ...` to `size_t N = ...`. Finally, your kernel has no thread check, so be sure that whatever sizes of `N` you test with are evenly divisible by 256. – Robert Crovella Jul 08 '16 at 15:17

1 Answer


To summarise, there are a number of problems here:

  1. You are compiling for the default architecture (sm_20), which restricts your kernel grid to 65535 blocks in the x dimension. At large array sizes, you are requesting too large a grid size and the kernel never runs.

Fix this as follows:

nvcc -arch=sm_52 vecAdd.cu timer.o
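
As an aside, a grid-stride loop is another way to avoid grid-dimension limits altogether, because the grid no longer needs one thread per element. A minimal sketch (the kernel name vecAddStride and the launch configuration are illustrative, and the bounds check also makes the kernel safe when N is not a multiple of the block size):

__global__ void vecAddStride(const float* a, const float* b, float* c, size_t n) {
  /* Each thread processes elements i, i + stride, i + 2*stride, ... */
  size_t stride = (size_t) gridDim.x * blockDim.x;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    c[i] = a[i] + b[i];
}

/* Launch with a fixed, modest grid; 256 blocks of 256 threads is illustrative */
vecAddStride<<<256, 256>>>(d_a, d_b, d_c, N);
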
  2. You lack any error checking around the kernel launch, so you are not detecting the kernel launch failure.

Fix this as follows:

vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
CUDA_CHECK_ERROR(cudaPeekAtLastError());
CUDA_CHECK_ERROR(cudaDeviceSynchronize());
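
The question's cuda_utils.h is not shown, so the exact definition of CUDA_CHECK_ERROR is unknown; a typical sketch of such a macro, assuming it simply checks a cudaError_t and aborts on failure, is:

#include <stdio.h>
#include <stdlib.h>

/* Sketch only: the real cuda_utils.h may define this differently */
#define CUDA_CHECK_ERROR(call)                                      \
  do {                                                              \
    cudaError_t err_ = (call);                                      \
    if (err_ != cudaSuccess) {                                      \
      fprintf(stderr, "CUDA error at %s:%d: %s\n",                  \
              __FILE__, __LINE__, cudaGetErrorString(err_));        \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)
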
  3. At large problem dimensions, the signed int you are using to calculate the size of memory allocations can overflow, leading to undefined results. You should use size_t instead.

Fix this as follows:

size_t N = .....;
size_t sz = N * sizeof(float);
CUDA_CHECK_ERROR(cudaMalloc(&d_a, sz));
// etc
talonmies
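
Putting the answer's three fixes together with the thread check suggested in the comments, a minimal corrected sketch might look like the following (the extra n parameter on the kernel is an addition needed for the bounds check; the other names follow the question's code):

__global__ void vecAdd(const float* a, const float* b, float* c, size_t n) {
  size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)               /* thread check: ignore out-of-range threads */
    c[i] = a[i] + b[i];
}

/* In main(): size_t avoids overflow; ceiling division covers all elements */
size_t N = 1024UL * 1024UL * 512UL;
dim3 BS(256, 1, 1);
dim3 GS((unsigned int) ((N + BS.x - 1) / BS.x), 1, 1);
vecAdd<<<GS, BS>>>(d_a, d_b, d_c, N);
CUDA_CHECK_ERROR(cudaPeekAtLastError());
CUDA_CHECK_ERROR(cudaDeviceSynchronize());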