I have the following minimal .cu
file
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#include <cstdio>
__global__ void test()
{
cublasHandle_t handle = nullptr;
cublasCreate(&handle);
}
int main(int, char**)
{
void * data = nullptr;
auto err = cudaMalloc(&data, 256);
printf("%s\n", cudaGetErrorString(err));
return 0;
}
As you can see, the test
kernel isn't even being called, however cudaMalloc
returns 30
(unknown error). The file is being compile with separable compilation (required for dynamic parallelism) and compute capability 5.2 (also tried 3.5 and 5.0, which didn't change anything). Removing the call to cublasCreate
causes cudaMalloc
to return 0
(no error).
What could be the cause? And how can I fix it? I need to call CUBLAS from a kernel using dynamic parallelism which is theoretically supported, so "just remove the call" is not an option.
Here is the corresponding CMakeLists.txt
:
cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
project(CublasError)
find_package(CUDA REQUIRED)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS --gpu-architecture=compute_52 -Xptxas=-v)
list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -keep -O0)
cuda_add_executable(${PROJECT_NAME} main.cu)
cuda_add_cublas_to_target(${PROJECT_NAME})
# FindCUDA.cmake does not automatically add (or find) cudadevrt which is required when separable compilation is on
if(CUDA_SEPARABLE_COMPILATION)
get_filename_component(CUDA_LIB_PATH ${CUDA_CUDART_LIBRARY} DIRECTORY)
find_library(CUDA_cudadevrt_LIBRARY cudadevrt PATHS ${CUDA_LIB_PATH})
target_link_libraries(${PROJECT_NAME} ${CUDA_cudadevrt_LIBRARY})
endif()
Here is a set of theoretically similar compile commands (the result is at least the same):
nvcc -dc --gpu-architecture=compute_52 -m64 main.cu -o main.dc.obj
nvcc -dlink --gpu-architecture=compute_52 -m64 main.dc.obj -o main.obj
link /SUBSYSTEM:CONSOLE /LIBPATH:"%CUDA_PATH%\lib\x64" main.obj main.dc.obj cudart_static.lib cudadevrt.lib cublas.lib cublas_device.lib