Using Cuda on WSL2 gives me "no kernel image is available for execution on the device."

Question

I am trying to use Cuda and Thrust in a C++ program on WSL2. I followed the instructions in here to enable Cuda on WSL2. Here is a small sample program:

first, I define:

export CUDA_LIBRARY_DIRECTORY=/usr/local/cuda-11.0/lib64
export CUDA_INCLUDE_DIRECTORY=/usr/local/cuda-11.0/include
export CUDACXX=/usr/local/cuda-11.0/bin/nvcc

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
project(proj LANGUAGES CXX CUDA)

set (CMAKE_CXX_STANDARD 14)

#### use cuda ####
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50;-lineinfo; -cudart=static; -Xptxas; -v)

include_directories($ENV{CUDA_INCLUDE_DIRECTORY})
link_directories($ENV{CUDA_LIBRARY_DIRECTORY})

ADD_EXECUTABLE(
proj 
src/cudafile.cu
src/main.cpp)

main.cpp

#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include<thrust/device_ptr.h>

void func(int size, int* a1, int* a2, int* a3);
void FillWithValue(int* arr, int size, int val);

int main()
{

    int size=1000;
    int *arr1, *arr2, *arr3;
    
    cudaMalloc((void**)&arr1, size * sizeof(int));
    FillWithValue(arr1,size,1);

    cudaMalloc((void**)&arr2, size * sizeof(int));
    FillWithValue(arr2,size,2);

    cudaMalloc((void**)&arr3, size * sizeof(int));

    int* harr = new int [size];
    cudaMemcpy(harr,arr1,size*sizeof(int),cudaMemcpyDeviceToHost);
    fprintf(stdout, "%d\n",harr[0]);


    func(size, arr1, arr2, arr3);
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
        fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString(err));
    

    return 1;

}

cudafile.cu

#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include<thrust/device_ptr.h>

#define blocksize 512
#define maxblocks 65535

__global__ void funcKernel(int size, int* a1, int* a2, int* a3)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    while (i < size)
    {
        a3[i]=a1[i]+a2[i];
    }
}

void func(int size, int* a1, int* a2, int* a3)
{
    int gridsize = size / blocksize + 1;
    if (gridsize > maxblocks) gridsize = maxblocks;

    funcKernel << <gridsize, blocksize >> > (size, a1, a2, a3);
}

void FillWithValue(int* arr, int size, int val)
{

    thrust::device_ptr<int> d = thrust::device_pointer_cast(arr);
    thrust::fill(d, d + size, val);
}

output

0
Cuda error: no kernel image is available for execution on the device.

Now, the output of the first fprintf proves that the Thrust fill function fails to fill the arrays, and cudaGetLastError() catches an error, proving that the kernel also fails.

This is the verbose cmake build:

cmake ..

-- The CXX compiler identification is GNU 9.3.0
-- The CUDA compiler identification is NVIDIA 11.0.221
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Check for working CUDA compiler: /usr/local/cuda-11.0/bin/nvcc
-- Check for working CUDA compiler: /usr/local/cuda-11.0/bin/nvcc -- works
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Configuring done
-- Generating done
-- Build files have been written to: /mnt/d/work/wsl2-projects/tests/kernels/build

make

/usr/bin/cmake -S/mnt/d/work/wsl2-projects/tests/kernels -B/mnt/d/work/wsl2-projects/tests/kernels/build --check-build-system CMakeFiles/Makefile.cmake 0
/usr/bin/cmake -E cmake_progress_start /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles/progress.marks
make -f CMakeFiles/Makefile2 all
make[1]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
make -f CMakeFiles/proj.dir/build.make CMakeFiles/proj.dir/depend
make[2]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
cd /mnt/d/work/wsl2-projects/tests/kernels/build && /usr/bin/cmake -E cmake_depends "Unix Makefiles" /mnt/d/work/wsl2-projects/tests/kernels /mnt/d/work/wsl2-projects/tests/kernels /mnt/d/work/wsl2-projects/tests/kernels/build /mnt/d/work/wsl2-projects/tests/kernels/build /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles/proj.dir/DependInfo.cmake --color=
Scanning dependencies of target proj
make[2]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
make -f CMakeFiles/proj.dir/build.make CMakeFiles/proj.dir/build
make[2]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
[ 33%] Building CUDA object CMakeFiles/proj.dir/src/cudafile.cu.o
/usr/local/cuda-11.0/bin/nvcc     -x cu -c /mnt/d/work/wsl2-projects/tests/kernels/src/cudafile.cu -o CMakeFiles/proj.dir/src/cudafile.cu.o
[ 66%] Building CXX object CMakeFiles/proj.dir/src/main.cpp.o
/usr/bin/c++   -I/usr/local/cuda-11.0/include  -std=gnu++14 -o CMakeFiles/proj.dir/src/main.cpp.o -c /mnt/d/work/wsl2-projects/tests/kernels/src/main.cpp
[100%] Linking CXX executable proj
/usr/bin/cmake -E cmake_link_script CMakeFiles/proj.dir/link.txt --verbose=1
/usr/bin/c++    -rdynamic CMakeFiles/proj.dir/src/cudafile.cu.o CMakeFiles/proj.dir/src/main.cpp.o  -o proj   -L/usr/local/cuda-11.0/lib64  -L/usr/local/cuda-11.0/targets/x86_64-linux/lib/stubs  -L/usr/local/cuda-11.0/targets/x86_64-linux/lib  -lcudadevrt -lcudart_static -lrt -lpthread -ldl
make[2]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
[100%] Built target proj
make[1]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
/usr/bin/cmake -E cmake_progress_start /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles 0

Is it something related to my GPU unmatching the Cuda version? I thought of downgrading to Cuda 10 or 9, but I don't know how to install it exactly like here, so that it does not replace the driver with another Nvidia driver.

Additional Info:

GeForce GTX 950M
Windows 11 Home. build 22000.51.
WSL2: Ubuntu-20.04
Cuda compilation tools, release 9.1, V9.1.85

you might want to provide the verbose output from the cmake build. — Robert Crovella, Jul 07 '21 at 14:20
@RobertCrovella ok. I edited the question. I hope this is what you asked for. — mradwan, Jul 08 '21 at 08:22
Your `CUDA_NVCC_FLAGS` that you specified in your `CMakeLists.txt` file are not getting applied during compilation with `nvcc`. Most importantly you specified this compile switch: `-gencode arch=compute_50,code=sm_50` which is correct for your GPU, but it is not getting used in your cmake verbose output. This is the problem. CUDA 11 compiles for compute capability 5.2 by default, and if you leave it that way, those codes won't run on a cc5.0 GPU which is what you have. This is not a problem with CUDA or wsl2, but a problem with your usage of cmake. — Robert Crovella, Jul 08 '21 at 14:18
see [here](https://stackoverflow.com/questions/53256405/howto-pass-flag-to-nvcc-compiler-in-cmake) — Robert Crovella, Jul 08 '21 at 14:20
@RobertCrovella It works. Thanks a lot ! I set CMAKE_CUDA_FLAGS instead of CUDA_NVCC_FLAGS. — mradwan, Jul 08 '21 at 15:23
@RobertCrovella Should I write an answer and accept it? or just leave it like that? — mradwan, Jul 08 '21 at 15:29
writing an answer to explain what you did for others is a good idea. Or after a while if you don't write an answer, I'll mark this question as a duplicate of the other one I linked. — Robert Crovella, Jul 08 '21 at 15:34

mradwan · Accepted Answer · 2021-07-08T16:33:53.267

Based on the comment of Robert Crovella, I managed to get the program to run properly with the right output and no errors.

In CMakeLists.txt, I used

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_50,code=sm_50 -lineinfo -cudart=static -Xptxas -v")

instead of

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50;-lineinfo; -cudart=static; -Xptxas; -v)

and now the output is

Using Cuda on WSL2 gives me "no kernel image is available for execution on the device."

1 Answers1