Kernel launch failure - Takes -100% of the time running on GPU

Question

The kernel update_umatrix fails to start and the profiler shows that it takes -100% ! the time of computation.

It is probably a simple problem but I have spent two weeks on it and still the kernel launch somehow fails to start according to nsight profiler, at least the U matrix doesn't update and containts all zeros (It's a partial implementation of FCM).

My GPU is GeForce 330M with Compute Capability of 1.2.

float *U;
float *V;
float *X;

__device__ float *U_d;
__device__ float *V_d;
__device__ float *X_d;

__global__ void update_umatrix(float *sqrerror,int C,int N,int S,float m)
{

    int i,j,k;
    int example_is_centroid;
    float summation, D_ki, D_kj;
    float newU;

    __shared__ float tmp_sqrerror[DIM];
    /* For each example in the dataset */
    k = threadIdx.x + blockIdx.x*blockDim.x;
    int local_offset = threadIdx.x;
    tmp_sqrerror[local_offset]=0;
        /* Special case: If Example is equal to a Cluster Centroid,
       then U=1.0 for that cluster and 0 for all others */
        if ( (example_is_centroid=is_example_centroid(k,S,C)) != -1 ) {
            for(int i=0; i<C; i++)
            {
            if ( i == example_is_centroid )
                U_d[k*C+i]=1.0;
            else
                U_d[k*C+i]=0.0;
            }
            return;
        }
    /* For each class */
    for(int i=0; i< C; i++)
    {
        summation=0;

        /* Calculate summation */
        for (j=0; j < C; j++) {
            D_ki=distance(X_d, V_d,k*DIM,i*S,S);
            D_kj=distance(X_d, V_d,k*DIM,j*S,S);
            summation += powf( D_ki / D_kj , (2.0/ (m-1)));
        }

        /* Weight is 1/sum */
        newU=1.0/summation;

        /* Add to the squareDifference */
        tmp_sqrerror[local_offset] += powf(U_d[k*C+i] - newU, 2);

        U_d[k*C+i]=newU;

    }
    __syncthreads();
    int t= blockDim.x/2;
    while(t>0)
    {
        if(k+t < N && threadIdx.x<t)
            tmp_sqrerror[local_offset] += tmp_sqrerror[local_offset+t];
        t/=2;
        __syncthreads();
    }

    if(threadIdx.x==0)
        sqrerror[blockIdx.x] = tmp_sqrerror[0];

}




int init()
{

float m = 2.0;
int C=2;
int S=2;
int N=340*340;
    int i,j;

    /* Allocate necessary storage */
    V=(float *)CALLOC(S*C, sizeof(float));

    U=(float *)CALLOC(C*N,sizeof(float));
    cudaGetErrorString(cudaMalloc(&U_d,N*C*sizeof(float)));
    cudaGetErrorString(cudaMalloc(&V_d,C*S*sizeof(float)));

    /* Place random values in V, then update U matrix based on it */
    srand48(seed);
    for (i=0; i < C; i++) {
        for (j=0; j < S; j++) {
            V[i*S+j]=drand48() * max_value[j];
        }
    }
    float *dummy;
    cudaMalloc(&dummy,N*sizeof(float));
    cudaGetErrorString(cudaMemcpyToSymbol(&V_d,V,C*S*sizeof(float),0,cudaMemcpyHostToDevice));
    /* Once values are populated in V, update the U Matrix for sane values */
    update_umatrix<<<(N+DIM-1)/DIM,DIM>>>(dummy,C,N,S,m);
    cudaGetErrorString(cudaGetLastError());
cudaDeviceSynchronize();

cudaGetErrorString(cudaMemcpyFromSymbol(U,&U_d,N*C*sizeof(float),cudaMemcpyDeviceToHost));
fprintf(stdout,"Initialization completed.\n");

    return 0;
}

If X[k] == V[i] for some i, then return that i. Otherwise, return -1

__device__ int is_example_centroid(int k,int S, int C)
{
    int  i,x;

    for (i=0; i < C; i++) {
        for (x=0; x < S; x++) {
            if ( X_d[k*DIM+x] != V_d[i*S+x] ) break;
        }
        if ( x == S )  /* X==V */
            return i;
    }
    return -1;
}

And the distance function:

__device__ float distance(float *v1, float *v2,int startV1,int startV2,int S)
{
    int x,i;
    float sum=0;

    for (x=startV1,i=startV2; x < startV1+DIM && i<startV2+S; x++, i++)
        sum += (v1[x] - v2[i]) * (v1[x] - v2[i]);

    return sqrt(sum);
}

[pastebin](http://pastebin.com/qwG7jAM5) not sure how to read it. — Soroosh Bateni, Apr 07 '13 at 22:04

Robert Crovella · Accepted Answer · 2013-04-07T23:22:20.690

This line of code is invalid:

cudaGetErrorString(cudaMemcpyToSymbol(&V_d,V,C*S*sizeof(float),0,cudaMemcpyHostToDevice));

It will compile, but it will throw an error at runtime. Since you appear to have it wrapped with error checking, I can only assume your error checking is broken.

The symbol you pass to cudaMemcpyToSymbol must be a valid symbol ONLY. It cannot be the address of a symbol, a symbol plus an offset or anything like that.

I also believe this line of code is not sensible, although I can't prove to myself that it cannot work:

cudaGetErrorString(cudaMalloc(&V_d,C*S*sizeof(float)));

You can probably do this if you want, but I'm not sure it's doing what you want, or that the malloc'ed region is in any way accessible from the host.

If you want variable size dynamic device allocations, why not just use ordinary cudaMalloc methods? Why the usage of device symbols? I'm not saying you can't make it work somehow, but this is not the way to do it.

EDIT responding to a question below: If you want to eliminate a function parameter and use a device variable instead, you can probably make it work but it seems like a lot of trouble to me, and to what end?

Anyway this is what I would do if I felt I really needed to do that:

#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)




float *V, *d_V;

__device__ float *V_d;

__global__ void my_kernel(){

  printf("V[3] = %f\n", V_d[3]);

}

int main() {

  int C=2;
  int S=2;

  V=(float *)calloc(S*C, sizeof(float));
  V[0] = 0.0f;
  V[3] = 4.0f;
  cudaMalloc((void **)&d_V,C*S*sizeof(float));
  cudaCheckErrors("malloc");
  cudaMemcpy(d_V, V, C*S*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("memcpy");
  cudaMemcpyToSymbol(V_d,&d_V,sizeof(float *));
  cudaCheckErrors("symbol");
  my_kernel<<<1,1>>>();
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");

  return 0;
}

You mean I can't directly allocate a `__device__` variable? Assigning a host pointer to it is the only way? I tend to eliminate the need to pass a pointer to the kernel. — Soroosh Bateni, Apr 07 '13 at 23:06
And also I had a working wrapper but thought it was broken since this one didn't generate any error! How on earth would this be broken?! — Soroosh Bateni, Apr 07 '13 at 23:10
Nice thank you. You are right it's not worth the trouble this way. — Soroosh Bateni, Apr 07 '13 at 23:24
Since you haven't shown the code for what your wrapper does, I'm unable to respond to the question "How on earth would this be broken?" You may want to review good error checking advice [here](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api), or look at the example I posted, which I also consider to be robust. — Robert Crovella, Apr 07 '13 at 23:37

Kernel launch failure - Takes -100% of the time running on GPU

1 Answers1