
I have a CUDA template library, in which one function is actually not a template, but is defined within a .cuh header. (vector_add_kernel in kernel.cuh below.)

If multiple .cu files include kernel.cuh and call vector_add (and thus vector_add_kernel), the result is multiple-definition errors at link time. In C++, one can use the inline qualifier to avoid such errors.

However, inline __global__ ... - while preventing the multiple definition errors on my system - results in a warning that the inline qualifier has been ignored.

Q: Is there a better way to avoid the multiple definition error, or a way to suppress this warning only for this function? And is inline __global__ even safe, or might other host compilers truly ignore it?

I could simply move vector_add_kernel to a separate .cu file, but it would then be the only non-header file in the library. I could also template vector_add_kernel, but in my library that makes little sense.

A (not-so-minimal, sorry) working example (tested with CUDA 7.0, gcc 4.7.2 on Debian) is below.

To clarify, main.cu is some user's code; lib.cu is some external library not belonging to me; and kernel.cuh is part of my template library. So, both the external lib and the user's main are using my template library, kernel.cuh - but separately.

main.cu:

#include "lib.hpp"
#include "kernel.cuh"

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cstddef>
#include <cstdlib>
#include <iostream>

int main(void)
{
    const size_t N = 1u << 7;

    float* a = (float*) malloc(N * sizeof(float));
    float* b = (float*) malloc(N * sizeof(float));
    float* c = (float*) malloc(N * sizeof(float));

    for (int i = 0; i < N; ++i) {
        a[i] = b[i] = 2.0f * i;
    }

    lib_vector_add(a, b, c, N);
    for (int i = 0; i < N; ++i) {
        if (c[i] != 2.0f * i + 2.0f * i)
            std::cout << "Error, lib, element " << i << std::endl;
    }

    thrust::device_vector<float> d_a(a, a + N);
    thrust::device_vector<float> d_b(b, b + N);
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);
    thrust::host_vector<float> h_c = d_c;
    for (int i = 0; i < N; ++i) {
        if (h_c[i] != 2.0f * i + 2.0f * i)
            std::cout << "Error, element " << i << std::endl;
    }
}

lib.cu:

#include <kernel.cuh>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

void lib_vector_add(float* a, float* b, float* c, size_t N)
{
    thrust::host_vector<float> h_a(a, a + N);
    thrust::host_vector<float> h_b(b, b + N);

    thrust::device_vector<float> d_a = h_a;
    thrust::device_vector<float> d_b = h_b;
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);

    thrust::host_vector<float> h_c = d_c;
    for (int i = 0; i < N; ++i)
    {
        c[i] = h_c[i];
    }
}

lib.hpp:

#pragma once

#include <cstddef>

void lib_vector_add(float*, float*, float*, size_t);

kernel.cuh: as posted, this form results in a linker error. Uncomment the first inline (the one on vector_add_kernel) to get a working build.

#pragma once

#include <thrust/device_vector.h>
#include <cstddef>

// inline keyword avoids multiple definition errors, but produces warnings.
// UNCOMMENT TO GET A WORKING EXECUTABLE.
// inline
__global__ void vector_add_kernel(
    const float *const a,
    const float *const b,
    float *const c,
    const size_t N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    while (tid < N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

// inline on this host function produces no warnings.
inline
void vector_add(
    const thrust::device_vector<float>& d_a,
    const thrust::device_vector<float>& d_b,
    thrust::device_vector<float>& d_c)
{
    const float *const a_ptr = thrust::raw_pointer_cast(d_a.data());
    const float *const b_ptr = thrust::raw_pointer_cast(d_b.data());
    float *const c_ptr = thrust::raw_pointer_cast(d_c.data());

    const size_t N = d_a.size();

    dim3 block(128);
    dim3 grid((N + 127) / 128);

    vector_add_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, N);
}

Makefile:

OBJS = main.o lib.o
DEPS = kernel.cuh
CU_ARCH = -gencode arch=compute_20,code=sm_20

all: app

app: $(OBJS)
    nvcc $(CU_ARCH) $(OBJS) -o app

%.o: %.cu $(DEPS)
    nvcc $(CU_ARCH) -dc -I./ $< -o $@

clean:
    -rm *.o
Sam
  • The better way is to get function definitions out of header files. That is general advice, not unique to CUDA. You've already got `lib.cu`, why not put it in there? – Robert Crovella Oct 09 '15 at 18:13
  • Because, in reality, only `kernel.cuh` is 'my' code. `lib.cu` is some external library, which uses my `kernel.cuh`, and `main.cu` is some unknown user's code, who is using both my `kernel.cuh` and the external `lib`. – Sam Oct 09 '15 at 18:14
  • Maybe using `#ifdef __CUDA_ARCH__` around your kernel definition would help? That way it will only get compiled when processed by nvcc. – void_ptr Oct 09 '15 at 19:11
  • @void_ptr that is not valid usage of `__CUDA_ARCH__` – Robert Crovella Oct 09 '15 at 20:58
  • What might work (aside from the decent solutions in the answer below) is to wrap the function definition in a `#ifdef FOO ... #endif` in `kernel.cuh`, then document it so the user (the person who writes `main.cu`) does `#define FOO` in `main.cu` before doing `#include kernel.cuh`. Then `FOO` won't have been defined in `lib.cu` so the function won't be defined twice. – Blair Houghton Oct 10 '15 at 05:16
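
For reference, the guard described in the last comment might look roughly like the sketch below. FOO is a placeholder macro name, and the scheme relies on relocatable device code (-dc, as in the Makefile) so that the call in lib.o resolves against the kernel defined in main.o at device-link time:

// kernel.cuh (sketch of the macro-guard idea; FOO is a placeholder name)

// The declaration is always visible, so every includer can launch the kernel.
__global__ void vector_add_kernel(
    const float *const a,
    const float *const b,
    float *const c,
    const size_t N);

// The definition is emitted only in the one .cu file that defines FOO before
// including this header (e.g. main.cu), so the symbol exists exactly once.
#ifdef FOO
__global__ void vector_add_kernel(
    const float *const a,
    const float *const b,
    float *const c,
    const size_t N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    while (tid < N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}
#endif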

1 Answer


If you want to keep your current code organisation, there is a very simple solution: declare your kernel static (in place of the inline keyword). This prevents the linker from complaining, but it will generate one copy of the kernel per compilation unit (object file) that includes kernel.cuh.
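
A minimal sketch of that change, keeping the kernel body exactly as in the question's kernel.cuh:

// static gives the kernel internal linkage: every object file that includes
// kernel.cuh gets its own private copy, so the device linker sees no clash.
static __global__ void vector_add_kernel(
    const float *const a,
    const float *const b,
    float *const c,
    const size_t N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    while (tid < N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}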

Another solution would be to templatise your kernel. I know you already dismissed this possibility, but you should reconsider it, since your kernel is a natural candidate for a template over the element type (here float) of its parameters...
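
A sketch of what that could look like; the explicit <float> instantiation shown at the end is an illustration, not the original code:

// Template definitions may live in headers without multiple-definition errors,
// since each instantiation gets "weak" linkage and duplicates are merged.
template <typename T>
__global__ void vector_add_kernel(
    const T *const a,
    const T *const b,
    T *const c,
    const size_t N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    while (tid < N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

// vector_add would then launch the float instantiation, e.g.:
// vector_add_kernel<float><<<grid, block>>>(a_ptr, b_ptr, c_ptr, N);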

Gilles