My following minimalist CUDA code returns an incorrect result (all polygons have 0 vertices at the end), while the same code running serially in C++ works well. The problem is embarrassingly parallel: no communication, no `__syncthreads()`, etc., and the CUDA memory allocations are successful. Even my dummy variable, which stores the content of the input array for debugging purposes, is 0 in the CUDA version. There is no out-of-bounds access, since my arrays are more than large enough. Replacing the memcpy with a loop in the CUDA version doesn't change anything.
I really don't understand what is happening... any idea? Thanks!
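
For what it's worth, this is roughly the kind of check behind my claim that the allocations succeed (a minimal sketch; the actual check is not part of the code below, so the error reporting here is illustrative only):

    cudaError_t err  = cudaMalloc((void **) &a_d, size);
    if (err != cudaSuccess)
      printf("cudaMalloc a_d failed: %s\n", cudaGetErrorString(err));   // illustrative check, not in the original code
    cudaError_t err2 = cudaMalloc((void **) &p_d, size_pol);
    if (err2 != cudaSuccess)
      printf("cudaMalloc p_d failed: %s\n", cudaGetErrorString(err2));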

CUDA code:

    #include <stdio.h>
    #include <iostream>
    #include <stdlib.h>
    #include <cuda.h>

    class Point2D {
     public:
     __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
     double x, y;
    };

    __device__ double dot(const Point2D &A, const Point2D &B) {
     return A.x*B.x + A.y*B.y;
    }
    __device__ Point2D operator*(double a, const Point2D &P) {
     return Point2D(a*P.x, a*P.y);
    }
    __device__ Point2D operator+(Point2D A, const Point2D &B) {
     return Point2D(A.x + B.x, A.y + B.y);
    }
    __device__ Point2D operator-(Point2D A, const Point2D &B) {
     return Point2D(A.x - B.x, A.y - B.y);
    }
    __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD 
      Point2D M = 0.5*(C+D);
      return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
    }

    class Polygon {
    public:
      __device__ Polygon():nbpts(0){};
      __device__ void addPts(Point2D pt) {
        pts[nbpts] = pt;
        nbpts++;
      }; 
      __device__ Polygon& operator=(const Polygon& rhs) {
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
      }
      __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {

        int new_nbpts = 0;
        Point2D newpts[128];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {

          Point2D E = pts[i];

          double ddot = -ABM + dot(AB, E);
          if (ddot<0) { // E inside clip edge
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2>0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
            newpts[new_nbpts] = E;
            new_nbpts++;
          } else {
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2<0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }       
          }
          S = E;
        }

        memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
        nbpts = new_nbpts;
      }

    //private:
     Point2D pts[128];
     int nbpts;
     float dummy;
    };


    __global__ void cut_poly(float *a, Polygon* polygons, int N)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx>=N/2) return;

      Polygon pol;
      pol.addPts(Point2D(0.,0.));
      pol.addPts(Point2D(1.,0.));
      pol.addPts(Point2D(1.,1.));
      pol.addPts(Point2D(0.,1.));

      Point2D curPt(a[2*idx], a[2*idx+1]);

      for (int i=0; i<N/2; i++) {
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }
      pol.dummy = a[idx];

      polygons[idx] = pol;
    }



    int main(int argc, unsigned char* argv[])
    {

      const int N = 100; 
      float a_h[N], *a_d; 
      Polygon p_h[N/2], *p_d;

      size_t size = N * sizeof(float);
      size_t size_pol = N/2 * sizeof(Polygon);

      cudaError_t err  = cudaMalloc((void **) &a_d, size);   
      cudaError_t err2 = cudaMalloc((void **) &p_d, size_pol);  

      for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
      cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

      int block_size = 4;
      int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
      cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);

      cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
      cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);

      for (int i=0; i<N/2; i++)
       printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);

      cudaFree(a_d);
      cudaFree(p_d);


        return 0;
    }

Same code in C++ that works properly:

    #include <stdio.h>
    #include <iostream>
    #include <stdlib.h>

    class Point2D {
     public:
     Point2D(double xx=0, double yy=0):x(xx),y(yy){};
     double x, y;
    };

    double dot(const Point2D &A, const Point2D &B) {
     return A.x*B.x + A.y*B.y;
    }
    Point2D operator*(double a, const Point2D &P) {
     return Point2D(a*P.x, a*P.y);
    }
    Point2D operator+(Point2D A, const Point2D &B) {
     return Point2D(A.x + B.x, A.y + B.y);
    }
    Point2D operator-(Point2D A, const Point2D &B) {
     return Point2D(A.x - B.x, A.y - B.y);
    }
    Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
      Point2D M = 0.5*(C+D);
      return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
    }

    class Polygon {
    public:
      Polygon():nbpts(0){};
      void addPts(Point2D pt) {
        pts[nbpts] = pt;
        nbpts++;
      };
      Polygon& operator=(const Polygon& rhs) {
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
      }
      void cut(const Point2D &inside_pt, const Point2D &outside_pt) {

        int new_nbpts = 0;
        Point2D newpts[128];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {

          Point2D E = pts[i];

          double ddot = -ABM + dot(AB, E);
          if (ddot<0) { // E inside clip edge
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2>0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
            newpts[new_nbpts] = E;
            new_nbpts++;
          } else {
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2<0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
          }
          S = E;
        }

        memcpy(pts, newpts, std::min(128, new_nbpts)*sizeof(Point2D));
        /*for (int i=0; i<128; i++) {
          pts[i] = newpts[i];
        }*/
        nbpts = new_nbpts;
      }

    //private:
     Point2D pts[128];
     int nbpts;
     float dummy;
    };


    void cut_poly(int idx, float *a, Polygon* polygons, int N)
    {
      if (idx>=N/2) return;

      Polygon pol;
      pol.addPts(Point2D(0.,0.));
      pol.addPts(Point2D(1.,0.));
      pol.addPts(Point2D(1.,1.));
      pol.addPts(Point2D(0.,1.));

      Point2D curPt(a[2*idx], a[2*idx+1]);

      for (int i=0; i<N/2; i++) {
        if (idx==i) continue;
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }
      pol.dummy = a[idx];

      polygons[idx] = pol;
    }



    int main(int argc, unsigned char* argv[])
    {

      const int N = 100;  // Number of elements in arrays
      float a_h[N], *a_d;  // Pointer to host & device arrays
      Polygon p_h[N/2], *p_d;

      for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;

      for (int idx=0; idx<N; idx++)
        cut_poly(idx, a_h, p_h, N);

      for (int i=0; i<N/2; i++)
       printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);

       return 0;
    }
  • Before we check the entire slab of code, anything you've narrowed it down to? – Bart Nov 29 '12 at 17:01
  • yes - I tried that yesterday with this question: http://stackoverflow.com/questions/13619025/cuda-strange-bug but apparently narrowing down the code was a bad idea, since the bug was no longer exhibited and the behavior of the algorithm was entirely different. :s – nbonneel Nov 29 '12 at 17:04
  • Before we go through the code, check the return values from the `cudaMemcpy()` calls for errors. – tera Nov 29 '12 at 17:09
  • actually you're getting an unspecified launch failure from the kernel. It's always good practice to check *all* cuda calls for errors. I don't really see any actual error checking in your code. I don't know what the unspecified launch failure is about, yet. The next step might be to successively remove or comment out pieces of the kernel, until the launch failure goes away, not unlike narrowing in on a seg fault in CPU code. – Robert Crovella Nov 29 '12 at 17:35
  • This line of kernel code, if commented out, makes the launch error go away ( and some additional data returned from the kernel is now printed out): `pol.cut(curPt, other_pt);` – Robert Crovella Nov 29 '12 at 17:44
  • thanks - I was expecting this function to be the source of the problem, but I don't see any issue with it, since it works in the serial version :s Even enforcing the fact that `new_nbpts` should be smaller than 127 by an `if` statement doesn't solve anything... – nbonneel Nov 29 '12 at 18:42
  • Actually I see that as the actual reason for the launch failure. If I enforce new_nbpts to be less than 127 via: `if ((ddot2>0) && (new_nbpts < 127))` (and likewise for the if (ddot...) in the else clause) then the launch failure goes away. It still does not match the output from the cpu version, but it's an important clue. I wouldn't be quick to discard it. You should analyze the code to find out why your expected bound is being exceeded. – Robert Crovella Nov 29 '12 at 18:46
  • I just tried it but my laptop crashed - I'll investigate more. But that seems very strange : the algorithm starts with a quad and cuts it with N/2 lines (N=100), which each could produce 2 intersection points. So, a large overestimate for the size of the table storing these points would be 104 (including the 4 initial points), but it usually should be much more like 6 or 7. I gave 128 just in case... – nbonneel Nov 30 '12 at 06:24

1 Answer


Well, I guess you can disregard most of my comments. By mistake I was working on a machine I had set up with CUDA 3.2, and it was behaving differently with respect to the kernel launch failure. When I switched to CUDA 4.1 and CUDA 5.0, things started to make sense. Apologies for my confusion there.

Anyway after getting past that, I pretty quickly noticed that there is a difference between your CPU and GPU implementations. Specifically here (looking at the CPU code):

    void cut_poly(int idx, float *a, Polygon* polygons, int N)
    {
      if (idx>=N/2) return;

      Polygon pol;
      pol.addPts(Point2D(0.,0.));
      pol.addPts(Point2D(1.,0.));
      pol.addPts(Point2D(1.,1.));
      pol.addPts(Point2D(0.,1.));

      Point2D curPt(a[2*idx], a[2*idx+1]);

      for (int i=0; i<N/2; i++) {
        if (idx==i) continue;     /*   NOTE THIS LINE MISSING FROM YOUR GPU CODE */
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }
      pol.dummy = a[idx];

      polygons[idx] = pol;
    }

Referring to the line I have commented above: if you add that exact line of code at the corresponding spot in the cut_poly kernel of your GPU code, then (for me, at least) the GPU code produces the same printed result as the CPU code.
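
In other words, the loop in the cut_poly kernel becomes (this is exactly what the full listing further down does):

      for (int i=0; i<N/2; i++) {
        if (idx==i) continue;     // the line that was missing from the GPU version
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }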

One other observation I would make is that you are needlessly running blocks with just 4 threads. Nothing wrong with that while you're working out the kinks in the code, but once you have it running for "production" purposes, you will most likely want to target a higher number like 256, and be sure to choose a number that is an integer multiple of 32, for best performance.
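
For example, something along these lines (just a sketch of the launch configuration; the exact block size is a tuning choice, and the modified code below still uses 128):

    int block_size = 256;   // a multiple of 32
    int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);   // round up so all N/2 polygons are covered
    cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);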

In response to a question posted in the comments, I believe that the data is being copied properly, but most likely you are not accessing it correctly on the host (I don't know how you're determining that "my array is not properly returned to the host"). Most of your class definitions were `__device__` only. As a result, it's difficult to access structures within those classes on the host (e.g. the `Point2D pts` array within the `Polygon` class). I'm inserting modified code here which I think demonstrates that the data is being transferred back to the host:

    #include <stdio.h>
    #include <iostream>
    #include <stdlib.h>
    //    #include <cuda.h>

    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)


    class Point2D {
     public:
     __host__ __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
     double x, y;
    };

    __host__ __device__ double dot(const Point2D &A, const Point2D &B) {
     return A.x*B.x + A.y*B.y;
    }
    __host__ __device__ Point2D operator*(double a, const Point2D &P) {
     return Point2D(a*P.x, a*P.y);
    }
    __host__ __device__ Point2D operator+(Point2D A, const Point2D &B) {
     return Point2D(A.x + B.x, A.y + B.y);
    }
    __host__ __device__ Point2D operator-(Point2D A, const Point2D &B) {
     return Point2D(A.x - B.x, A.y - B.y);
    }
    __host__ __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
      Point2D M = 0.5*(C+D);
      return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
    }

    class Polygon {
    public:
      __host__ __device__ Polygon():nbpts(0){};
      __host__ __device__ void addPts(Point2D pt) {
        pts[nbpts] = pt;
        nbpts++;
      };
      __host__ __device__ Polygon& operator=(const Polygon& rhs) {
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
      }
      __host__ __device__ Point2D getpoint(unsigned i){
        if (i<128) return pts[i];
        else return pts[0];
        }
      __host__ __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {

        int new_nbpts = 0;
        Point2D newpts[128];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {

          Point2D E = pts[i];

          double ddot = -ABM + dot(AB, E);
          if (ddot<0) { // E inside clip edge
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2>0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
            newpts[new_nbpts] = E;
            new_nbpts++;
          } else {
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2<0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
          }
          S = E;
        }

        memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
        nbpts = new_nbpts;
      }

    //private:
     Point2D pts[128];
     int nbpts;
     float dummy;
    };


    __global__ void cut_poly(float *a, Polygon* polygons, int N)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx>=N/2) return;

      Polygon pol;
      pol.addPts(Point2D(0.,0.));
      pol.addPts(Point2D(1.,0.));
      pol.addPts(Point2D(1.,1.));
      pol.addPts(Point2D(0.,1.));

      Point2D curPt(a[2*idx], a[2*idx+1]);

      for (int i=0; i<N/2; i++) {
        if (idx==i) continue;
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }
      pol.dummy = pol.getpoint(0).x;

      polygons[idx] = pol;
    }



    int main(int argc, unsigned char* argv[])
    {

      const int N = 100;
      float a_h[N], *a_d;
      Polygon p_h[N/2], *p_d;

      size_t size = N * sizeof(float);
      size_t size_pol = N/2 * sizeof(Polygon);

      cudaMalloc((void **) &a_d, size);
      cudaCheckErrors("cm1");
      cudaMalloc((void **) &p_d, size_pol);
      cudaCheckErrors("cm2");

      for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
      cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
      cudaCheckErrors("cmcp1");

      int block_size = 128;
      int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
      cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
      cudaCheckErrors("kernel");

      cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
      cudaCheckErrors("cmcp2");
      cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);
      cudaCheckErrors("cmcp3");

      for (int i=0; i<N/2; i++)
       printf("%f \t %f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].getpoint(0).x, p_h[i].nbpts);

      cudaFree(a_d);
      cudaFree(p_d);


        return 0;
    }
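
Something like the following should build and run it (the file name and the `-arch` flag are assumptions on my part; use the architecture that matches your GPU):

    nvcc -arch=sm_20 -o cut_poly cut_poly.cu
    ./cut_poly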

I would suggest posting new questions for these things.

  • Arrrgg! Thanks a lot! I was debugging my serial and parallel implementation in parallel, and I missed a correction I did in the serial code...! Many thanks for spotting the bug and for spending time on it! – nbonneel Nov 30 '12 at 21:30
  • in fact there is still a problem with this code that I also can't figure out: the Polygons that are returned to the host all have their vertex coordinates set to 0 (in `pol[i].pts[...]`) instead of their actual values (although the number of vertices is now correct and is in the variable `nbpoints`). :s I checked that my Polygon::operator= is properly called. It's just that my array is not properly returned to the host. – nbonneel Nov 30 '12 at 23:20
  • Note that if I output one of the polygon's vertex coordinates in my `dummy` variable, then I get the correct answer. – nbonneel Nov 30 '12 at 23:27
  • I've edited my answer to respond to this question. Please consider posting new SO questions instead, as this one will get horribly convoluted otherwise. – Robert Crovella Dec 01 '12 at 00:24
  • ok - I'll post a new question. However, copying and pasting your exact same piece of code results in all the `p_h[i].getpoint(0)` values being displayed as `0.000000`. I'll check whether it's a bug in my driver by testing on another machine before posting a new question. Thanks again! – nbonneel Dec 01 '12 at 01:43
  • oh ok - that's indeed a driver error since the code is working on my desktop machine, but not my laptop. Thanks for all! – nbonneel Dec 01 '12 at 02:05
  • Puzzling. I get matching values between p_h[i].getpoint(0).x and p_h[i].dummy (and they are not all zero). If there was a problem with the driver or cuda install, I would expect one of my error checking lines to throw an error. Did you copy and paste my *entire* code and run it as-is? If you post a new question, please include which GPU you are running on as well as your installed cuda version. – Robert Crovella Dec 01 '12 at 02:11
  • yes I copied and pasted your entire code and no error was reported. I don't really mind if it's not working on my laptop, as I recently ordered a new one which should arrive soon (just for info, the cuda version is 5.0; my laptop's GPU is a GeForce G102M while my desktop computer has a GTX 480) – nbonneel Dec 01 '12 at 02:33
  • Hi @RobertCrovella, I encountered a strange problem in CUDA, can you help me take a look at it? I put my declarations and definitions in separate .cu and .cuh files. It compiles without any problem, but when the kernel runs I get cuda error code 77, an invalid memory access error. Once I put the definition in the .cuh file and use the __forceinline__ option, the invalid memory access error is gone. Do you have any idea what the problem is? Thank you – Charles Chow Apr 02 '15 at 04:55
  • No I don't know what the problem is. – Robert Crovella Apr 02 '15 at 13:46