0

I am using Nvidia Nsight to debug my code and I noticed that my distance kernel is sometimes not launching. The code attempts to find the distance between two polygons. It does this by going through each segment of each polygon and finding the distance between the two segments. The following is my code:

__device__ double point_segment_distance(double px, double py, double x1, double y1, double x2, double y2)
{
    double dx = x2 - x1;
    double dy = y2 - y1;
    //
    //if (dx < 0.01 && dy < 0.01)
    //{
    //  return hypot(px - x1, py - y1);
    //}
    //double t = ((px - x1) * dx + (py - y1) * dy) / (dx * dx + dy * dy);

    //if (t < 0)
    //{
    //  dx = px - x1;
    //  dy = py - y1;
    //}
    //else if (t > 1)
    //{
    //  dx = px - x2;
    //  dy = py - y2;
    //}
    //else
    //{
    //  double near_x = x1 + t * dx;
    //  double near_y = y1 + t * dy;
    //  dx = px - near_x;
    //  dy = py - near_y;
    //}

    //return hypot(dx, dy);
    return 10.0;
}

__device__ bool segments_intersect(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
    double dx1 = x12 - x11;
    double dy1 = y12 - y11;
    double dx2 = x22 - x21;
    double dy2 = y22 - y21;
    double delta = dx2 * dy1 - dy2 * dx1;
    if (delta < 0.01)
    {
        return false;
    }
    double s = (dx1 * (y21 - y11) + dy1 * (x11 - x21)) / delta;
    double t = (dx2 * (y11 - y21) + dy2 * (x21 - x11)) / (-delta);
    return (0 <= s && s <= 1 && 0 <= t && t <= 1);
}

__device__ double segments_distance(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
    if (segments_intersect(x11, y11, x12, y12, x21, y21, x22, y22))
    {
        return 0.0;
    }
    double minimumDist = 999999;
    double tempDist = point_segment_distance(x11, y11, x21, y21, x22, y22);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x12, y12, x21, y21, x22, y22);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x21, y21, x11, y11, x12, y12);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x22, y22, x11, y11, x12, y12);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    return minimumDist;
}

__global__ void distance(double *x0, double *y0, double *x1, double *y1, double *dist, int *length0, int *length1, int *numDone)
{
    int numComp = threadIdx.x + blockDim.x*blockIdx.x + *numDone;
    int index = threadIdx.x + blockDim.x*blockIdx.x;
    dist[index] = 99999;
    if  (numComp < ((*length0)*(*length1)))
    {
        int spot0 = numComp%(*length0);
        int spot1 = numComp/(*length0);
        dist[index] = segments_distance(x0[spot0], y0[spot0], x0[(spot0+1)%(*length0)], y0[(spot0+1)%(*length0)], x1[spot1], y1[spot1], x1[(spot1+1)%(*length1)], y1[(spot1+1)%(*length1)]);
    }
}

void gpuDistance(double *x0, double *y0, double *x1, double *y1)
{
    ...
    distance<<<165, 1024>>>(dev_x0, dev_y0, dev_x1, dev_y1, dev_dist, dev_length0, dev_length1, dev_numDone);
    ...
}

I commented out much of point_segment_distance in order to help me locate the error. This will not launch the distance kernel. I know this because I am using Nsight Cuda Debugging and it doesn't hit my breakpoints.

However, if I comment the line "double dy = y2 - y1;" in point_segment_distance the distance kernel will launch. How is this possible? Why would creating one more double cause the kernel to not launch. Is there a limit to the number of doubles that may be created on the GPU. I have a Tesla 2075. I am aware of the local memory limit of 512kb. However, looking at my code I can't imagine that I'm anywhere near that limit. Thanks for any help!

Miggy
  • 79
  • 1
  • 6
  • 1
    are you sure that your kernel is not launched? not hitting breakpoins could be due to aggressive optimizations performed by the compiler. you should perform proper cuda error checking. – Vitality Oct 22 '13 at 16:26
  • You are correct! Thank you so much! The kernel is still launching. I have a cudamemcpy function after i call distance. When I uncomment "double dy = y2 - y1;" the breakpoint doesn't catch but it returns values of 10.0. When I comment "double dy = y2 - y1;" the breakpoint does catch and it still returns values of 10.0. This is very strange behavior to me. I can't believe one extra line of code will cause the debugger to ignore an entire function. Oh well. Can you make your comment into an answer so I can accept it? Thanks again. – Miggy Oct 22 '13 at 16:39

1 Answers1

2

Not hitting breakpoints does not mean that the kernel is not executed since the compiler has the freedom to perform aggressive optimizations on the code. To check the correctness of kernel launches, you should better perform canonical CUDA error checking in the sense of talonmies' post

What is the canonical way to check for errors using the CUDA runtime API?

To have an idea of the optimizations the compiler can perform, consider for example the following code

__global__ void point_segment_distance(double* distance_squared, const double* __restrict__ x1, const double* __restrict__ y1, const double* __restrict__ x2, const double* __restrict__ y2)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    double dx = x2[i] - x1[i];
    double dy = y2[i] - y1[i];

    //distance_squared[i] = dx*dx+dy*dy;
}

Note the commented instruction. When such an instruction is illustrated, then everything inside the kernel function becomes dead code, since it will not contribute to global memory data, and is eliminated by the compiler. Indeed, the disassembled code becomes

/*0000*/        MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */ 
/*0008*/        EXIT ;                 /* 0x8000000000001de7 */ 

When the above instruction is uncommented, then the compiler will produce

/*0000*/        MOV R1, c[0x1][0x100];                /* 0x2800440400005de4 */
/*0008*/        S2R R2, SR_CTAID.X;                   /* 0x2c00000094009c04 */
/*0010*/        S2R R3, SR_TID.X;                     /* 0x2c0000008400dc04 */
/*0018*/        MOV32I R0, 0x8;                       /* 0x1800000020001de2 */
/*0020*/        IMAD R18, R2, c[0x0][0x8], R3;        /* 0x2006400020249ca3 */  
/*0028*/        IMAD R8.CC, R18, R0, c[0x0][0x38];    /* 0x20018000e1221ca3 */
/*0030*/        IMAD.HI.X R9, R18, R0, c[0x0][0x3c];  /* 0x20808000f1225ce3 */
/*0038*/        IMAD R16.CC, R18, R0, c[0x0][0x40];   /* 0x2001800101241ca3 */
/*0040*/        LD.E.64 R10, [R8];                    /* 0x8400000000829ca5 */
/*0048*/        IMAD.HI.X R17, R18, R0, c[0x0][0x44]; /* 0x2080800111245ce3 */
/*0050*/        IMAD R12.CC, R18, R0, c[0x0][0x30];   /* 0x20018000c1231ca3 */
/*0058*/        LD.E.64 R4, [R16];                    /* 0x8400000001011ca5 */
/*0060*/        IMAD.HI.X R13, R18, R0, c[0x0][0x34]; /* 0x20808000d1235ce3 */
/*0068*/        IMAD R6.CC, R18, R0, c[0x0][0x28];    /* 0x20018000a1219ca3 */
/*0070*/        LD.E.64 R2, [R12];                    /* 0x8400000000c09ca5 */
/*0078*/        IMAD.HI.X R7, R18, R0, c[0x0][0x2c];  /* 0x20808000b121dce3 */
/*0080*/        LD.E.64 R14, [R6];                    /* 0x8400000000639ca5 */
/*0088*/        DADD R2, R4, -R2;                     /* 0x4800000008409d01 */
/*0090*/        DMUL R6, R2, R2;                      /* 0x5000000008219c01 */
/*0098*/        DADD R4, R10, -R14;                   /* 0x4800000038a11d01 */
/*00a0*/        IMAD R2.CC, R18, R0, c[0x0][0x20];    /* 0x2001800081209ca3 */
/*00a8*/        DFMA R4, R4, R4, R6;                  /* 0x200c000010411c01 */
/*00b0*/        IMAD.HI.X R3, R18, R0, c[0x0][0x24];  /* 0x208080009120dce3 */
/*00b8*/        ST.E.64 [R2], R4;                     /* 0x9400000000211ca5 */
/*00c0*/        EXIT ;                                /* 0x8000000000001de7 */

and the code is not dead anymore.

Community
  • 1
  • 1
Vitality
  • 20,705
  • 4
  • 108
  • 146