I use the checkCudaErrors helper function from the CUDA Toolkit Samples. See "helper_cuda.h". I am perplexed as to why the launch error from this example is not caught by checkCudaErrors. The error is too many threads are launched (2048).
From Debug (linux gdb), the console prints (stderr in red) "warning: Cuda API error detected: cudaLaunch returned (0x9)".
Whereas when I execute either the Release or Debug builds from a Bash shell, no error is printed by checkCudaErrors.
Why is this?
My expectation is the error would be caught and printed at the D2H memcpy call immediately proceeding the launch. Is this incorrect?
Minimal reproducible example:
#include <cuda.h>
#include "helper_cuda.h"
__global__ void BusyIncrementKernel( const size_t increments, float * result){
float tmp = 0;
for ( size_t i = 0; i < increments; ++i ){ tmp += 1; }
const int j = threadIdx.x + blockIdx.x*blockDim.x;
if ( j == 0 ){ *result = tmp; }
}
int main( int argc, char * argv[] ){
unsigned int blockDim = 2048;
dim3 block{ blockDim, 1, 1};
dim3 grid{ 1, 1, 1};
float * dResult;
checkCudaErrors( cudaMalloc( &dResult, sizeof(float) ));
BusyIncrementKernel<<< grid, block >>>( 10000000, dResult );
float result;
checkCudaErrors( cudaMemcpy( &result, dResult, sizeof(float), cudaMemcpyDeviceToHost ));
checkCudaErrors( cudaFree( dResult ));
checkCudaErrors( cudaDeviceSynchronize() );
fprintf( stderr,"result: %f\n", result );
return 0;
}