Why do I get "Unspecified Launch failure" in CUDA program, multiplying 2 matrices

Question

I am new to CUDA. When I multiply a 1024x1024 matrix and launch the kernel with the following configuration, it runs without error:

multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);

But when I multiply a 2048 x 2048 matrix, with dim3(64,64,1) I get this error:

cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure

From tinkering with the code, I think that the error is in this statement

result += a[row * size + ind] * b[col + size * ind];

in the part

b[col+size*ind]

If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated. I am using Visual Studio 2013. I am using the debugger, but this does not help me find the error.

This seems to be a similar problem: cudaDeviceSynchronize returned error code 4 after launching

many thanks, here is the code:

cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size); 
// Computes one element of c = a * b for square, row-major int matrices.
//
// Launch layout: a 2D grid of 2D blocks in which thread (x, y) produces
// c[y][x]. The grid must cover at least size x size threads; any surplus
// threads exit immediately through the bounds guard below.
//
// Parameters:
//   c    - output matrix, size * size ints
//   a, b - input matrices, size * size ints each
//   size - number of rows (and columns) of every matrix
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned     int size)
{
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Valid indices are 0 .. size-1, so the guard must reject row == size and
    // col == size as well. The previous `>` test let those threads through,
    // where they read and wrote outside the matrices.
    if (row >= size || col >= size) return;

    // Index arithmetic is done in size_t so row * size + ind cannot wrap
    // around for large matrices.
    const size_t n = size;
    const size_t rowBase = (size_t)row * n;

    int result = 0;
    for (size_t ind = 0; ind < n; ++ind) {
        // a is walked along its row, b down its column.
        result += a[rowBase + ind] * b[ind * n + col];
    }

    // Target field in the flattened 1-D output.
    c[rowBase + col] = result;
}

// Entry point: fills two sizeMatrix x sizeMatrix matrices with random 0/1
// values, multiplies them on the GPU through multiplyWithCuda, and resets the
// device. Returns 0 on success and 1 if any CUDA step fails.
int main(){

    const int sizeMatrix = 2048;
    // Element count is computed in size_t so it cannot overflow int if
    // sizeMatrix is increased.
    const size_t count = (size_t)sizeMatrix * sizeMatrix;

    int* a = new int[count];
    int* b = new int[count];
    int* c = new int[count];

    for (size_t i = 0; i < count; i++) {
        a[i] = rand() % 2;
        b[i] = rand() % 2;
    }

    cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);

    // The host buffers are not used past this point; release them here so
    // neither the success path nor the early error returns leak them.
    delete[] a;
    delete[] b;
    delete[] c;

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "multiplyWithCuda failed!\n");
        return 1;
    }

    // cudaDeviceReset is called before exiting; its status is checked like
    // every other CUDA call.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}


// Host wrapper: multiplies two square, row-major int matrices on the GPU.
//
// Parameters:
//   c    - host output buffer, size * size ints
//   a, b - host input buffers, size * size ints each
//   size - number of rows (and columns) of every matrix
//
// Returns cudaSuccess, or the status of the first CUDA call that failed.
// Device buffers are released on every path.
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned     int size)
{
    // Start as NULL so the cleanup at Error never hands cudaFree an
    // uninitialised pointer when an early step fails.
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    cudaError_t cudaStatus;

    // All locals with initialisers are declared before the first `goto Error`
    // because C++ forbids jumping over an initialisation.

    // Widen before multiplying so the byte count cannot wrap in unsigned int.
    const size_t bytes = (size_t)size * size * sizeof(int);

    // 32 x 32 threads per block, and enough blocks per side to cover the
    // whole matrix (ceiling division written so it cannot overflow).
    const unsigned int tile = 32;
    const unsigned int blocksPerSide = size / tile + (size % tile != 0 ? 1u : 0u);
    const dim3 block(tile, tile, 1);
    const dim3 grid(blocksPerSide, blocksPerSide, 1);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Reported only after the status check, so it is never printed on failure.
    fprintf(stdout, "device set");

    // An empty matrix needs no work, and a zero-sized grid is not a valid
    // launch configuration; leave with the current (success) status.
    if (size == 0) {
        goto Error;
    }

    // Allocate GPU buffers for three matrices (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for c allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for a allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    fprintf(stdout, "buffer for b allocated \n");

    // Copy input matrices from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy a done \n");

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy b done\n");

    fprintf(stdout, "about to launch kernel \n");

    // Launch a kernel on the GPU with one thread for each output element.
    // The grid is derived from `size` rather than hard-coded, so the same
    // call works for 1024, 2048 or any other matrix size.
    multiplyKernel<<<grid, block>>>(dev_c, dev_a, dev_b, size);

    fprintf(stdout, "kernel launched\n");

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    // NOTE: on Windows, a kernel that runs longer than the WDDM TDR delay is
    // killed by the driver and shows up here as "unspecified launch failure";
    // debug builds and large matrices make that more likely.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));

        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for the success and failure paths; freeing a NULL
    // pointer is harmless.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

You may be running into a WDDM [TDR timeout](http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm) on windows. Your code runs correctly (i.e. without runtime error) for me. If you are building this as a debug project (likely, since you are running it in debug) then the kernel will take even longer. — Robert Crovella, Feb 05 '16 at 19:43
Yes, that was it. I updated the WDDM TDR delay in the Nsight monitor to 10 s, and now it runs fine. Thank you very much, I would never have found it. — Old_Mortality, Feb 06 '16 at 17:30
Why don't you provide an answer stating what you did. Later on you can come back and accept your own answer. That way the question will more likely be preserved and understood by future readers. — Robert Crovella, Feb 07 '16 at 15:00
Thank you, I have done that. Sorry, I am quite inept with Stack Exchange. — Old_Mortality, Feb 08 '16 at 19:51

score 3 · Accepted Answer · answered Feb 08 '16 at 19:49

On Windows, I right clicked the NSight monitor icon in the system tray. There I chose Options>General. We see WDDM TDR delay. It was at 2, and I increased it to 10. Then, I ran my program again, and it worked fine. This was according to Robert's link (see above) http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm

Why do I get "Unspecified Launch failure" in CUDA program, multiplying 2 matrices

1 Answer

Linked

Related