19

This is part of my CUDA code. But last part of this code says some error message.

unsigned int *mat_count;
off_t *mat_position;
unsigned int *matches_count;
off_t *matches_position;
......
cudaMalloc ( (void **) &mat_count,    sizeof(unsigned int)*10);
cudaMalloc ( (void **) &mat_position, sizeof(off_t)*10);
......
matches_count    = (unsigned int *)malloc(sizeof(unsigned int)*10);
matches_position = (off_t *)malloc(sizeof(off_t)*10);
for ( i = 0 ; i < 10 ; i++ ) {
    matches_count   [i] = 0;
    matches_position[i] = 0;
}
......
cudaMemcpy (mat_count,    matches_count   , sizeof(unsigned int)*10, cudaMemcpyHostToDevice );
cudaMemcpy (mat_position, matches_position, sizeof(off_t)*10,        cudaMemcpyHostToDevice );
......
match<<<BLK_SIZE,THR_SIZE>>>(
        reference_total,
        indextable_total,
        sequences, 
        start_sequence, 
        sequence_length, 
        end_sequence,
        ref_base,
        idx_base,
        msk_base,
        mat_count,
        mat_position,
        reference,
        first_indexes,
        seqmaskc
        );
err=cudaGetLastError();
if(err!=cudaSuccess)
{
printf("\n1 %s\n", cudaGetErrorString(err));
}
err=    cudaMemcpy (matches_count   , mat_count,    sizeof(unsigned int)*10, cudaMemcpyDeviceToHost );
if(err!=cudaSuccess)
{
printf("\n2 %s\n", cudaGetErrorString(err));
}
err=    cudaMemcpy (matches_position, mat_position, sizeof(off_t)*10, cudaMemcpyDeviceToHost );
if(err!=cudaSuccess)
{
printf("\n3 %s\n", cudaGetErrorString(err));
}

The following part of code had reported "unspecified launch failure" this error message. I don't know why this error message is reported.

err=cudaMemcpy (matches_position, mat_position, sizeof(off_t)*10, cudaMemcpyDeviceToHost );
if(err!=cudaSuccess)
{
printf("\n3 %s\n", cudaGetErrorString(err));
}

The followings are part of match function.

__global__ void match(...)
{
    ......
reference_blk = (THR_SIZE * blockIdx.x + threadIdx.x) * 32 + reference;
......
//-- added for parallize --//
for (p = start_p ; p != last_p ; p++) {
    for ( s = start_sequence, sequence = sequences ; s != end_sequence ;
            s++, sequence += sequence_bytes ) {
        ref_off = *(((unsigned int*)(idx_base)) + p);

        shifted_in = 0;

        if((int)(first_indexes[s-start_sequence] % 8 - ref_off % 8) < 0){
            int shamt2 = (ref_off % 8 - first_indexes[s-start_sequence] % 8);

            mask_buffer = *((unsigned long *)(msk_base + (ref_off - first_indexes[s-start_sequence])/8)) >> shamt2;

            if( ( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence))) ^ mask_buffer ) << shamt2) continue;
        }
        else if((int)(first_indexes[s-start_sequence] % 8 - ref_off % 8) == 0){
            mask_buffer = *((unsigned long *)(msk_base + (ref_off)/8));

            if( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence)) ^ mask_buffer)) continue;
        }
        else{
            int shamt2 = 8 - (first_indexes[s-start_sequence] % 8 - ref_off % 8);

            mask_buffer = *((unsigned long *)(msk_base + (ref_off/8- first_indexes[s-start_sequence]/8) - 1)) >> shamt2;

            if( ( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence))) ^ mask_buffer ) << shamt2) continue;
        }

        //full compare
        if((int)(first_indexes[s-start_sequence] % 4 - ref_off % 4) < 0){
            int shamt = (ref_off % 4 - first_indexes[s-start_sequence] % 4) * 2;
            memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4, sequence_bytes);
            ......
            //-- instead of memcmp --//
            int v = 0;
            char *p1 = (char *)sequence;
            char *p2 = (char *)reference_blk;
            int tmp_asd = sequence_bytes;
            while(tmp_asd!=0){
                v = *(p1++) - *(p2++);
                if(v!=0)
                    break;
                tmp_asd--;
            }

            if(v == 0){
                mat_count[s - (int)start_sequence]++;      /* Maintain count */
                mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence]; /* Record latest position */
            }
        }
        else if((int)(first_indexes[s-start_sequence] % 4 - ref_off % 4 )== 0){
            memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4, sequence_bytes);
            .......
            //-- instead of memcmp --//
            int v = 0;
            char *p1 = (char *)sequence;
            char *p2 = (char *)reference_blk;
            int tmp_asd = sequence_bytes;
            while(tmp_asd!=0){
                v = *(p1++) - *(p2++);
                if(v!=0)
                    break;
                tmp_asd--;
            }
            if(v == 0){
                mat_count[s - (int)start_sequence]++;      /* Maintain count */
                mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence]; /* Record latest position */
            }
        }
        else
        {
            int shamt = 8 - (first_indexes[s-start_sequence] % 4 - ref_off % 4) * 2;

            memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4 - 1, 32);
            ......
            //-- instead of memcmp --//
            int v = 0;
            char *p1 = (char *)sequence;
            char *p2 = (char *)reference_blk;
            int tmp_asd = sequence_bytes;
            while(tmp_asd!=0){
                v = *(p1++) - *(p2++);
                if(v!=0)
                    break;
                tmp_asd--;
            }

            if (v == 0){
                mat_count[s - (int)start_sequence]++;      /* Maintain count */
                mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence];/* Record latest position */
            }
        }
    }
}

}

Jimmy
  • 473
  • 5
  • 9
  • 13
  • What are the values of `BLK_SIZE` and `THR_SIZE` ? – Paul R Mar 28 '12 at 05:56
  • BLK_SIZE and THR_SIZE are one. – Jimmy Mar 28 '12 at 05:59
  • You probably need to post the code for the kernel function `match` – Paul R Mar 28 '12 at 06:41
  • That means you have memory violation. Run under debugger or memory checker. – Anycorn Mar 28 '12 at 07:02
  • You need to call cudaThreadSynchronize() before you can call cudaGetLastError() because kernels are launched asynchronously. Your kernel probably has not even finished by the time you call cudaGetLastError(). And then you are mucking around with cuda memory. Not saying this is your problem, but it could help. – Apprentice Queue Mar 28 '12 at 14:47
  • 1
    And have you checked that the numerous offsets that you calculate are valid? Especially when calling memcpy()? – Apprentice Queue Mar 28 '12 at 15:07
  • It's worth pointing out to newcomers of this thread that my program only failed to stop working after I started to increase the number of grids/threads; I was violating memory long before then, one of my arrays was smaller than the others and I hadn't accounted for it - it was still working successfully at lower G/T counts. (admittedly with minimum error checking) – Phill Feb 11 '16 at 09:09

3 Answers3

47

An unspecified launch failure is almost always a segfault. You've got an indexing mistake somewhere in your kernel, probably while accessing global memory.

I'd look through your code, but it's mildly incomprehensible...

P O'Conbhui
  • 1,203
  • 1
  • 9
  • 16
31

Compile your application with debug flags nvcc -G -g and try running your application inside cuda-memcheck or cuda-gdb. It might give you a hint where the problem might lie.

Just run

cuda-memcheck ./yourApp
Przemyslaw Zych
  • 2,000
  • 1
  • 21
  • 24
0

For me CUDA was generating "unspecified launch failure" due to an infinite recursion not detected by nvcc. The code was doing simply:

int f() {
  return f();
}

The backtrace in cuda-gdb showed some arbitrary code rather than the source of the error.

Serge Rogatch
  • 13,865
  • 7
  • 86
  • 158