To find out the compute capability of my GPUs, I wrote this small program:
// Print the compute capability (major.minor) of every CUDA device that the
// runtime reports.
cudaDeviceProp prop;
int count = 0;  // defined value even if the device-count query fails
cudaError_t err = cudaGetDeviceCount( &count );
if (err != cudaSuccess) {
    fprintf( stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString( err ) );
    count = 0;  // skip the loop instead of iterating over an unreliable count
}
for (int i = 0; i < count; i++) {
    err = cudaGetDeviceProperties( &prop, i );
    if (err != cudaSuccess) {
        // Do not print fields of a struct the failed call may not have filled in.
        fprintf( stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString( err ) );
        continue;
    }
    printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
}
It prints 3.5 for every one of my GPUs.
Now I am trying to compile the following toy program (with nvcc -c):
// Element-wise addition of two float arrays: the block index selects the
// element, so block i stores a[i] + b[i] into c[i]. Blocks whose index is not
// below n do nothing.
// NOTE(review): this function carries the __global__ qualifier, i.e. it is a
// kernel. If it is only meant to be called as a helper from another kernel,
// __device__ is the qualifier for that; confirm the intent.
__global__ void add_device(float *a, float *b, float *c, int n)
{
    const int idx = blockIdx.x;
    // Guard clause: indices at or beyond n have no element to process.
    if (idx >= n) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// Kernel entry point that forwards its arguments unchanged to add_device.
// NOTE(review): add_device is itself declared __global__, so the call below is
// a kernel invoked from inside another kernel. nvcc only accepts that when the
// compilation target is compute_35 or above; the target is chosen by nvcc's
// -arch option (or its built-in default), not by the GPU installed in the
// machine. If add_device is only meant to be a helper, declaring it __device__
// instead of __global__ turns this into an ordinary device function call.
__global__ void add_kernel(float *a, float *b, float *c, int n)
{
add_device(a, b, c, n);
}
// Host-side wrapper: launches add_kernel on a grid of n blocks with one thread
// per block, passing all four arguments straight through.
// a, b and c are presumably device pointers to at least n floats each; confirm
// against the caller.
void gpu_add(float *a, float *b, float *c, int n)
{
    // A grid needs at least one block: n == 0 would be rejected as an invalid
    // launch configuration, and a negative n would be converted to a huge
    // unsigned grid dimension. There is nothing to add in either case.
    if (n <= 0) {
        return;
    }
    add_kernel<<<n, 1>>>( a, b, c, n );
}
but the compilation fails with the following error:
calling a __global__ function("add_device") from a __global__ function("add_kernel") is only allowed on the compute_35 architecture or above
What am I doing wrong?