I'm new to CUDA, but so far it is driving me mad. Running the following code produces: "CUDA error: unspecified launch failure". I can't figure out the cause of this error; the only thing I have noticed is that if I lower the iteration count of the main loop by, say, two orders of magnitude, it runs fine. Here is the example (don't try to find the logic behind these operations; it is heavily simplified code, made purely for demonstration).
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Demonstration kernel: fills four 256-entry lookup tables with the identity
// mapping, seeds a 4-word state with {0, 1, 2, 3}, and then repeatedly mixes
// the state by XOR-ing table lookups indexed by individual bytes of the state
// words.
//
// The kernel takes no arguments and stores nothing outside its own automatic
// variables, so it has no result observable from the host.
//
// NOTE(review): the round loop below runs ten million dependent iterations in
// every thread. With a single-thread launch this is one long serial kernel; on
// a GPU that also drives a display, the OS watchdog may kill such a kernel and
// the runtime may report that as a launch failure -- confirm on the target
// system.
__global__ void Test()
{
    const int kTableSize = 256;
    const int kRounds = 10000000;

    int table0[kTableSize];
    int table1[kTableSize];
    int table2[kTableSize];
    int table3[kTableSize];

    // Identity tables: entry k holds the value k in all four tables.
    for (int k = 0; k < kTableSize; ++k)
    {
        table0[k] = table1[k] = table2[k] = table3[k] = k;
    }

    // Initial state words are 0, 1, 2, 3.
    int input[4] = { 0, 1, 2, 3 };

    for (int round = 0; round < kRounds; ++round)
    {
        // Snapshot the current state so that every new word is derived from
        // the values of the previous round, not from partially updated ones.
        const int w0 = input[0];
        const int w1 = input[1];
        const int w2 = input[2];
        const int w3 = input[3];

        // The (unsigned char) cast keeps only the low 8 bits of the shifted
        // word, so every index stays within 0..255.
        input[0] = table0[(unsigned char)(w0 >> 24)]
                 ^ table1[(unsigned char)(w1 >> 16)]
                 ^ table2[(unsigned char)(w2 >> 8)]
                 ^ table3[(unsigned char)(w3)];

        input[1] = table0[(unsigned char)(w1 >> 24)]
                 ^ table1[(unsigned char)(w2 >> 16)]
                 ^ table2[(unsigned char)(w3 >> 8)]
                 ^ table3[(unsigned char)(w0)];

        input[2] = table0[(unsigned char)(w2 >> 24)]
                 ^ table1[(unsigned char)(w3 >> 16)]
                 ^ table2[(unsigned char)(w0 >> 8)]
                 ^ table3[(unsigned char)(w1)];

        input[3] = table0[(unsigned char)(w3 >> 24)]
                 ^ table1[(unsigned char)(w0 >> 16)]
                 ^ table2[(unsigned char)(w1 >> 8)]
                 ^ table3[(unsigned char)(w2)];
    }
}
// Selects device 0, launches the Test kernel with one block of one thread,
// waits for it to finish, and resets the device.
//
// Returns: cudaSuccess when device selection, the kernel launch, and the
// kernel execution all succeeded; otherwise the first error encountered.
// Launch and execution errors are also printed to stdout.
cudaError_t TestWithCUDA()
{
    // Declared before any goto so that no jump crosses an initialisation.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    Test<<<1,1>>>();

    // A kernel launch returns no status itself; launch errors are retrieved
    // with cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Errors raised while the kernel executes are returned by the next
    // synchronising call, so its return value must be kept and checked.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
        printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));

Error:
    // Reset the device on every path. The original status is what the caller
    // needs, so the reset result is deliberately not folded into it.
    cudaDeviceReset();
    return cudaStatus;
}
// Program entry point: runs TestWithCUDA() and prints whether it reported
// success.
//
// Returns: 0 when TestWithCUDA() returns cudaSuccess, 1 otherwise, so that
// scripts and calling processes can detect a failed run from the exit code.
int main()
{
    cudaError_t cudaStatus = TestWithCUDA();
    if (cudaStatus != cudaSuccess) {
        printf("The test has failed!\n");
        return 1;
    }

    printf("Done!\n");
    return 0;
}
Does anyone have any ideas?