CUDA error message : unspecified launch failure

Question

This is part of my CUDA code. But last part of this code says some error message.

unsigned int *mat_count;
off_t *mat_position;
unsigned int *matches_count;
off_t *matches_position;
......
cudaMalloc ( (void **) &mat_count,    sizeof(unsigned int)*10);
cudaMalloc ( (void **) &mat_position, sizeof(off_t)*10);
......
matches_count    = (unsigned int *)malloc(sizeof(unsigned int)*10);
matches_position = (off_t *)malloc(sizeof(off_t)*10);
for ( i = 0 ; i < 10 ; i++ ) {
    matches_count   [i] = 0;
    matches_position[i] = 0;
}
......
cudaMemcpy (mat_count,    matches_count   , sizeof(unsigned int)*10, cudaMemcpyHostToDevice );
cudaMemcpy (mat_position, matches_position, sizeof(off_t)*10,        cudaMemcpyHostToDevice );
......
match<<<BLK_SIZE,THR_SIZE>>>(
        reference_total,
        indextable_total,
        sequences, 
        start_sequence, 
        sequence_length, 
        end_sequence,
        ref_base,
        idx_base,
        msk_base,
        mat_count,
        mat_position,
        reference,
        first_indexes,
        seqmaskc
        );
err=cudaGetLastError();
if(err!=cudaSuccess)
{
printf("\n1 %s\n", cudaGetErrorString(err));
}
err=    cudaMemcpy (matches_count   , mat_count,    sizeof(unsigned int)*10, cudaMemcpyDeviceToHost );
if(err!=cudaSuccess)
{
printf("\n2 %s\n", cudaGetErrorString(err));
}
err=    cudaMemcpy (matches_position, mat_position, sizeof(off_t)*10, cudaMemcpyDeviceToHost );
if(err!=cudaSuccess)
{
printf("\n3 %s\n", cudaGetErrorString(err));
}

The following part of code had reported "unspecified launch failure" this error message. I don't know why this error message is reported.

err=cudaMemcpy (matches_position, mat_position, sizeof(off_t)*10, cudaMemcpyDeviceToHost );
if(err!=cudaSuccess)
{
printf("\n3 %s\n", cudaGetErrorString(err));
}

The followings are part of match function.

__global__ void match(...)
{
    ......
reference_blk = (THR_SIZE * blockIdx.x + threadIdx.x) * 32 + reference;
......
//-- added for parallize --//
for (p = start_p ; p != last_p ; p++) {
    for ( s = start_sequence, sequence = sequences ; s != end_sequence ;
            s++, sequence += sequence_bytes ) {
        ref_off = *(((unsigned int*)(idx_base)) + p);

        shifted_in = 0;

        if((int)(first_indexes[s-start_sequence] % 8 - ref_off % 8) < 0){
            int shamt2 = (ref_off % 8 - first_indexes[s-start_sequence] % 8);

            mask_buffer = *((unsigned long *)(msk_base + (ref_off - first_indexes[s-start_sequence])/8)) >> shamt2;

            if( ( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence))) ^ mask_buffer ) << shamt2) continue;
        }
        else if((int)(first_indexes[s-start_sequence] % 8 - ref_off % 8) == 0){
            mask_buffer = *((unsigned long *)(msk_base + (ref_off)/8));

            if( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence)) ^ mask_buffer)) continue;
        }
        else{
            int shamt2 = 8 - (first_indexes[s-start_sequence] % 8 - ref_off % 8);

            mask_buffer = *((unsigned long *)(msk_base + (ref_off/8- first_indexes[s-start_sequence]/8) - 1)) >> shamt2;

            if( ( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence))) ^ mask_buffer ) << shamt2) continue;
        }

        //full compare
        if((int)(first_indexes[s-start_sequence] % 4 - ref_off % 4) < 0){
            int shamt = (ref_off % 4 - first_indexes[s-start_sequence] % 4) * 2;
            memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4, sequence_bytes);
            ......
            //-- instead of memcmp --//
            int v = 0;
            char *p1 = (char *)sequence;
            char *p2 = (char *)reference_blk;
            int tmp_asd = sequence_bytes;
            while(tmp_asd!=0){
                v = *(p1++) - *(p2++);
                if(v!=0)
                    break;
                tmp_asd--;
            }

            if(v == 0){
                mat_count[s - (int)start_sequence]++;      /* Maintain count */
                mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence]; /* Record latest position */
            }
        }
        else if((int)(first_indexes[s-start_sequence] % 4 - ref_off % 4 )== 0){
            memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4, sequence_bytes);
            .......
            //-- instead of memcmp --//
            int v = 0;
            char *p1 = (char *)sequence;
            char *p2 = (char *)reference_blk;
            int tmp_asd = sequence_bytes;
            while(tmp_asd!=0){
                v = *(p1++) - *(p2++);
                if(v!=0)
                    break;
                tmp_asd--;
            }
            if(v == 0){
                mat_count[s - (int)start_sequence]++;      /* Maintain count */
                mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence]; /* Record latest position */
            }
        }
        else
        {
            int shamt = 8 - (first_indexes[s-start_sequence] % 4 - ref_off % 4) * 2;

            memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4 - 1, 32);
            ......
            //-- instead of memcmp --//
            int v = 0;
            char *p1 = (char *)sequence;
            char *p2 = (char *)reference_blk;
            int tmp_asd = sequence_bytes;
            while(tmp_asd!=0){
                v = *(p1++) - *(p2++);
                if(v!=0)
                    break;
                tmp_asd--;
            }

            if (v == 0){
                mat_count[s - (int)start_sequence]++;      /* Maintain count */
                mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence];/* Record latest position */
            }
        }
    }
}

}

You probably need to post the code for the kernel function `match` — Paul R, Mar 28 '12 at 06:41
That means you have memory violation. Run under debugger or memory checker. — Anycorn, Mar 28 '12 at 07:02
You need to call cudaThreadSynchronize() before you can call cudaGetLastError() because kernels are launched asynchronously. Your kernel probably has not even finished by the time you call cudaGetLastError(). And then you are mucking around with cuda memory. Not saying this is your problem, but it could help. — Apprentice Queue, Mar 28 '12 at 14:47
And have you checked that the numerous offsets that you calculate are valid? Especially when calling memcpy()? — Apprentice Queue, Mar 28 '12 at 15:07
It's worth pointing out to newcomers of this thread that my program only failed to stop working after I started to increase the number of grids/threads; I was violating memory long before then, one of my arrays was smaller than the others and I hadn't accounted for it - it was still working successfully at lower G/T counts. (admittedly with minimum error checking) — Phill, Feb 11 '16 at 09:09

score 47 · Answer 1 · answered Mar 31 '12 at 02:24

47

An unspecified launch failure is almost always a segfault. You've got an indexing mistake somewhere in your kernel, probably while accessing global memory.

I'd look through your code, but it's mildly incomprehensible...

answered Mar 31 '12 at 02:24

P O'Conbhui

1,203
1
9
16

score 31 · Answer 2 · answered Oct 20 '12 at 08:38

31

Compile your application with debug flags nvcc -G -g and try running your application inside cuda-memcheck or cuda-gdb. It might give you a hint where the problem might lie.

Just run

cuda-memcheck ./yourApp

answered Oct 20 '12 at 08:38

Przemyslaw Zych

2,000
1
21
24

score 0 · Answer 3 · answered Oct 23 '21 at 15:59

0

For me CUDA was generating "unspecified launch failure" due to an infinite recursion not detected by nvcc. The code was doing simply:

int f() {
  return f();
}

The backtrace in cuda-gdb showed some arbitrary code rather than the source of the error.

answered Oct 23 '21 at 15:59

Serge Rogatch

13,865
7
86
158

CUDA error message : unspecified launch failure

3 Answers3

Linked