1

I'm trying to launch a kernel from another kernel in CUDA (dynamic parallelism). The host launches the `parent` kernel, and `parent` in turn launches the `child` kernel, which performs the element-wise addition. Compiling this code produces the following error:

calling a __global__ function("child") from a __global__ function("parent") is only allowed on the compute_35 architecture or above

My GPU is an NVIDIA GeForce 940M, which has compute capability 5.0. How can I fix this?

#include <stdio.h>
#include <stdlib.h>

#define N 5
#define BLOCK_DIM 3

/*
 * Element-wise addition of two N x N integer matrices: c = a + b.
 *
 * Each thread handles one element, addressed through a 2D grid of 2D blocks:
 * the x dimension selects the column and the y dimension selects the row.
 * Matrices are indexed as flat arrays with N elements per row.
 * Threads that fall outside the N x N extent do nothing, so the grid may be
 * larger than the matrix.
 */
__global__ void child (int *a, int *b, int *c) {
    const int x = threadIdx.x + blockDim.x * blockIdx.x;   /* column */
    const int y = threadIdx.y + blockDim.y * blockIdx.y;   /* row    */

    /* Guard the grid tail: the launch rounds the grid size up. */
    if (x >= N || y >= N) {
        return;
    }

    const int flat = y * N + x;
    c[flat] = a[flat] + b[flat];
}

/*
 * Parent kernel: launches the `child` kernel from the device
 * (CUDA dynamic parallelism).
 *
 * Requirements: compute capability 3.5 or higher and relocatable device
 * code, e.g. `nvcc -arch=sm_50 -rdc=true`.
 *
 * Only the thread with threadIdx.x == 0 in each block issues the launch, so
 * a parent launched as <<<1,1>>> starts exactly one child grid.
 *
 * No explicit device-side synchronization is performed: a parent grid is not
 * considered complete until every child grid it launched has completed, so a
 * host-side synchronization on the parent also covers the child. The
 * deprecated cudaThreadSynchronize() call was removed for that reason, and
 * because explicit device-side synchronization on child grids is no longer
 * supported by recent CUDA toolkits.
 */
__global__ void parent (int *a, int *b, int *c) {
    /* Ceil-divide so the child grid covers all N x N elements. */
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    if (threadIdx.x == 0) {
        child<<<dimGrid, dimBlock>>>(a, b, c);

        /* A failed device-side launch is otherwise silent; report it. */
        cudaError_t status = cudaGetLastError();
        if (status != cudaSuccess) {
            printf("child launch failed: error %d\n", (int)status);
        }
    }
}

/*
 * Print an N x N integer matrix to stdout: one matrix row per output line,
 * each value followed by a tab character.
 */
void printMatrix(int a[N][N] )
{
    for (int row = 0; row < N; ++row) {
        const int *cells = a[row];

        for (int col = 0; col < N; ++col) {
            printf("%d\t", cells[col]);
        }

        printf("\n");
    }
}

/*
 * Abort with a readable message if a CUDA runtime call failed.
 * `what` names the operation for the error message.
 */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fill two N x N matrices with pseudo-random values in [0, 99], add them on
 * the GPU via the `parent` kernel, and print the operands and the result.
 *
 * Every CUDA call is checked so that a failed allocation, copy or launch
 * stops the program instead of printing an uninitialized result matrix.
 */
int main() {
    int a[N][N], b[N][N], c[N][N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    const size_t size = N * N * sizeof(int);

    for(int i=0; i<N; i++)
        for (int j=0; j<N; j++){
            a[i][j] = rand() % 100;
            b[i][j] = rand() % 100;
        }

    printf("Matrix one\n");
    printMatrix(a);

    printf("Matrix two\n");
    printMatrix(b);

    checkCuda(cudaMalloc((void**)&dev_a, size), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void**)&dev_b, size), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void**)&dev_c, size), "cudaMalloc(dev_c)");

    checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "copy b to device");

    parent<<<1,1>>>(dev_a, dev_b, dev_c);
    /* Launch-configuration errors are only visible through cudaGetLastError. */
    checkCuda(cudaGetLastError(), "parent kernel launch");
    /* Errors raised while the kernel runs surface at the next synchronization. */
    checkCuda(cudaDeviceSynchronize(), "parent kernel execution");

    checkCuda(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    printf("Sum of two matrix\n");
    printMatrix(c);

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

    return 0;
}
  • 1
    What's your build command? You probably did not set the target architecture, which defaults to compute capability 2.0. Try `nvcc -arch=sm_50`. – havogt Mar 02 '16 at 08:27
  • works fine when `nvcc -arch=sm_50 -rdc=true kernel.cu` –  Mar 02 '16 at 08:58

0 Answers0