I have some simple cuda code that I am compiling to a static library using nvcc
, and some user code that I am compiling with g++
and linking against the previously compiled static library. When attempting link, I get linker errors for things like cudaMalloc
even if I use the -cudart static
option in the nvcc
compile command line.
Here is my code:
//kern.hpp
#include <cstddef>
class Kern
{
private:
float* d_data;
size_t size;
public:
Kern(size_t s);
~Kern();
void set_data(float *d);
};
//kern.cu
#include <iostream>
#include <kern.hpp>
__global__ void kern(float* data, size_t size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < size)
{
data[idx] = 0;
}
}
Kern::Kern(size_t s) : size(s)
{
cudaMalloc((void**)&d_data, size*sizeof(float));
}
Kern::~Kern()
{
cudaFree(d_data);
}
void Kern::set_data(float* d)
{
size_t grid_size = size;
std::cout << "Starting kernel with grid size " << grid_size << " and block size " << 1 <<
std::endl;
kern<<<grid_size, 1>>>(d_data, size);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess)
std::cout << "ERROR: " << cudaGetErrorString(err) << std::endl;
cudaDeviceSynchronize();
cudaMemcpy((void*)d, (void*)d_data, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
//main.cpp
#include <iostream>
#include <kern.hpp>
int main(int argc, char** argv)
{
std::cout << "starting" << std::endl;
Kern k(256);
float arr[256];
k.set_data(arr);
bool ok = true;
for(int i = 0; i < 256; ++i) ok &= arr[i] == 0;
std::cout << (ok ? "done" : "wrong") << std::endl;
}
I am compiling the kern with nvcc
as follows:
nvcc -I ./ -lib --compiler-options '-fPIC' -o libkern.a kern.cu -cudart static
And then main with g++
as follows:
g++ -o main main.cpp -I ./ -L. -L/opt/cuda/lib64 -lkern
Which produces the errors:
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `Kern::Kern(unsigned long)':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x4d): undefined reference to `cudaMalloc'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `Kern::~Kern()':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x6b): undefined reference to `cudaFree'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `Kern::set_data(float*)':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x152): undefined reference to `__cudaPushCallConfiguration'
/usr/bin/ld: tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x175): undefined reference to `cudaGetLastError'
/usr/bin/ld: tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x1a1): undefined reference to `cudaGetErrorString'
/usr/bin/ld: tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x1c6): undefined reference to `cudaDeviceSynchronize'
/usr/bin/ld: tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x1ee): undefined reference to `cudaMemcpy'
/usr/bin/ld: tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x1f3): undefined reference to `cudaDeviceSynchronize'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `__cudaUnregisterBinaryUtil()':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x24e): undefined reference to `__cudaUnregisterFatBinary'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `__nv_init_managed_rt_with_module(void**)':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x269): undefined reference to `__cudaInitModule'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `__device_stub__Z4kernPfm(float*, unsigned long)':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x305): undefined reference to `__cudaPopCallConfiguration'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `__nv_cudaEntityRegisterCallback(void**)':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x430): undefined reference to `__cudaRegisterFunction'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `__sti____cudaRegisterAll()':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x44b): undefined reference to `__cudaRegisterFatBinary'
/usr/bin/ld: tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x47c): undefined reference to `__cudaRegisterFatBinaryEnd'
/usr/bin/ld: ./libkern.a(tmpxft_00001d30_00000000-8_kern.o): in function `cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*)':
tmpxft_00001d30_00000000-5_kern.cudafe1.cpp:(.text+0x4d9): undefined reference to `cudaLaunchKernel'
collect2: error: ld returned 1 exit status
But if I do the following:
g++ -o main main.cpp -I ./ -L. -L/opt/cuda/lib64 -lkern -lcudart
everything works.
My question is that since I have a -cudart static
in the nvcc
compiled line, shouldn't the libkern.a
already have the symbols for the cuda runtime resolved? Why is the -lcudart
still necessary in the g++
line?
Also, if I change libkern.a
to a shared object, not linking to the cuda runtime in the g++
line works. That is, the following works:
nvcc -I ./ -shared --compiler-options '-fPIC' -o libkern.so kern.cu -cudart static
g++ -o main main.cpp -I ./ -L. -L/opt/cuda/lib64 -lkern
Why does the static library version fail, but the shared object version work?
Note that I have tried the above scenarios after replacing -cudart static
with -lcudart_static
in the nvcc
line, and there was no change in behavior from making that replacement. This is to be expected as the two options essentially do the same thing right?
I am on linux.
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Oct_23_19:24:38_PDT_2019
Cuda compilation tools, release 10.2, V10.2.89
g++ --version
g++ (GCC) 10.1.0
Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Any help and/or clarification is much appreciated.