-1

I have trouble implementing lambda expression in CUDA kernel code, it compiles but fails at execution. I use Ubuntu 18.04 with CUDA 10.1 and compile with --expt-extended-lambda.

I'm just trying to implement some basic operations on a custom device matrix, like dot multiplication, addition, subtraction, etc., in a clean and concise way.

I've tested thrust but it led to several device memory errors when mixed with more complex custom CUDA kernel code. Manually allocating CUDA memory with cudaMalloc, casting it to a thrust::device_ptr and then using thrust routines didn't work well, so I would prefer to get rid of thrust.

Here is a sample of basic use of template expression that fails and I don't know why. The transform / transformProcess methods fail. Apparently the lambda expression passed with binaryFunction cannot be applied in device code.

EDIT 2 (fixed the code to have no compilation error)

Types.cuh

#ifndef TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH
#define TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH

#include <cuda_runtime.h>
#include <nvfunctional>
#include <iostream>
#include <vector>
#include <string>

typedef unsigned int uint;

// Report an API error code with its source location.
//
// Returns true on success (e == 0), false otherwise. CUDA runtime error
// codes (cudaError_t) are positive with 0 (cudaSuccess) meaning success,
// so any nonzero value must be treated as a failure; the previous `e < 0`
// test silently ignored every CUDA error.
inline bool check(int e, int iLine, const char *szFile) {
    if (e != 0) {
        std::cerr << "General error " << e << " at line " << iLine << " in file " << szFile << std::endl;
        return false;
    }
    return true;
}

// Wraps a call returning an error code, tagging failures with file/line.
#define ck(call) check(call, __LINE__, __FILE__)

// Lightweight, non-owning device matrix: `data` must point to device memory
// allocated/freed by the caller (main() uses cudaMalloc/cudaFree on it).
template <typename precision>
struct CudaMatrix {
    // NOTE(review): nvstd::function objects initialized in host code cannot
    // be passed to and invoked from device code (documented CUDA caveat), so
    // routing the __device__ lambdas below through this typedef is suspect —
    // likely the cause of the runtime failure discussed on this page.
    typedef nvstd::function<precision(precision, precision)> binaryFunction;

    // Stores the dimensions only; no device allocation happens here.
    CudaMatrix(uint width, uint height) : width(width), height(height) { }

    // Number of elements (not bytes); callable from host and device.
    __device__ __host__ uint size() const { return width * height; }

    uint       bytesSize() const { return size() * sizeof(precision); }      // byte size of the device buffer
    void       fill(precision value);                                        // set every element (kernel launch)
    void       setValuesFromVector(const std::vector<precision> &vector);    // host -> device copy
    void       display(const std::string &name = "") const;                  // device -> host copy, then print
    CudaMatrix transform(const CudaMatrix &A, binaryFunction lambda);        // element-wise: this[i] = lambda(this[i], A[i])

    // Element-wise compound operators implemented via transform().
    CudaMatrix  operator+=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x + y; }); }
    CudaMatrix  operator-=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x - y; }); }
    CudaMatrix  operator*=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x * y; }); }

    precision      *data;    // device pointer (allocated externally)
    uint           width,
                   height;
};

#endif //TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH

Types.cu


#include "Types.cuh"

/**
 * Device code to set a matrix value to the given one
 *
 * @tparam precision - The matrix precision
 *
 * @param matrix - The matrix to set the value to
 * @param value - The value to set
 */
template <typename precision>
__global__ void fillProcess(CudaMatrix<precision> matrix, precision value)
{
    // Flat 1D global thread index.
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so surplus threads exit.
    if (x >= matrix.size()) {
        return;
    }

    *(matrix.data + x) = value;
}

/**
 * Device code to apply a function f for each element of matrix A and B with A = f(A, B)
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The matrix A to store the result in
 * @param B - The matrix B to compute the result from
 * @param transform - The function to apply on each A'elements such as A(i) = transform(A(i), B(i))
 */
template<typename precision>
__global__ void transformProcess(               CudaMatrix<precision>                 A,
                                                CudaMatrix<precision>                 B,
                                 const typename CudaMatrix<precision>::binaryFunction &transform
) {
    // Flat 1D global thread index.
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= A.size()) {
        return;
    }

    // NOTE(review): `transform` is an nvstd::function initialized in host
    // code, and it is also taken by reference (a host-memory reference).
    // nvstd::function objects initialized on the host cannot be invoked in
    // device code, so this call is the likely source of the failure.
    *(A.data + x) = transform(*(A.data + x), *(B.data + x));
}

/**
 * Display the matrix
 *
 * @tparam precision - The matrix precision
 *
 * @param name - The matrix name
 */
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name) const
{
    precision *hostValues;

    // Stage the device buffer in pinned host memory, then copy it back.
    // (cudaMemcpy D2H is blocking, so the data is ready once it returns.)
    ck(cudaMallocHost(&hostValues, bytesSize()));
    ck(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));

    std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of " << typeid(precision).name()
              << "\n\n";

    // Print row by row. Assumes width >= 1: `width - 1` underflows for an
    // empty matrix since width is unsigned — TODO confirm callers.
    for (int i = 0; i < height; ++i) {
        std::cout << "{ ";

        for (int j = 0; j < width - 1; ++j) {
            std::cout << *(hostValues + i * width + j) << ", ";
        }

        // Last element of row i sits at index (i + 1) * width - 1.
        std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
    }

    std::cout << std::endl;

    ck(cudaFreeHost(hostValues));
}

/**
 * Fill the matrix with the given value
 *
 * @tparam precision - The matrix precision
 *
 * @param value - The value to set all matrix's elements with
 */
template <typename precision>
void CudaMatrix<precision>::fill(precision value)
{
    const uint threadsPerBlock = 128;
    // Rounds up; the kernel bounds-checks, so a surplus block is harmless.
    const uint numBlock        = size() / threadsPerBlock + 1;

    fillProcess<<< numBlock, threadsPerBlock >>>(*this, value);

    // Kernel launches do not report errors through a return value; surface
    // launch-configuration errors immediately instead of at the next
    // synchronizing call.
    ck(cudaGetLastError());
}

/**
 * Set the matrix values in device CUDA memory from a host standard vector
 *
 * @param vector - The values to set
 */
/**
 * Copy host values into the device buffer.
 *
 * Copies at most size() elements so that an oversized host vector cannot
 * overflow the device allocation pointed to by `data` (the previous version
 * copied vector.size() elements unconditionally).
 */
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
{
    const size_t count = vector.size() < size() ? vector.size() : static_cast<size_t>(size());

    ck(cudaMemcpy(data, vector.data(), count * sizeof(precision), cudaMemcpyHostToDevice));
}

/**
 * Apply the function "fn" to all elements of the current matrix such as *this[i] = fn(*this[i], A[i])
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The input matrix A
 * @param op - The binary function to apply
 *
 * @return This
 */
template<typename precision>
CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix &A, binaryFunction fn)
{
    const uint threadsPerBlock = 128;
    // Rounds up; surplus threads are filtered inside the kernel.
    const uint numBlock        = size() / threadsPerBlock + 1;

    // NOTE(review): `fn` is an nvstd::function constructed on the host from
    // a __device__ lambda; such objects cannot be used in device code, so
    // this launch is the suspect path behind the reported failure.
    transformProcess<<< numBlock, threadsPerBlock >>>(*this, A, fn);

    // Shallow copy: the returned matrix shares the same device pointer.
    return *this;
}

// Explicit template instantiations: emit the method definitions from this
// translation unit for every precision the rest of the program uses.

template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;

main.cpp

#include "Types.cuh"

int main(int argc, char **argv)
{
    // NOTE(review): this translation unit uses CUDA constructs (and includes
    // Types.cuh, which contains __device__ code), so it must be processed by
    // nvcc as a .cu file rather than handed to the host compiler as .cpp.

    // Allocate memory
    CudaMatrix<double> m1(3, 3);
    CudaMatrix<double> m2(3, 3);

    // CudaMatrix does not own its buffer; device memory is allocated here.
    ck(cudaMalloc(&m1.data, m1.bytesSize()));
    ck(cudaMalloc(&m2.data, m2.bytesSize()));

    // Test here

    m1.setValuesFromVector({1, 1, 1, 2, 2, 2, 3, 3, 3});
    m2.fill(10);

    m1.display("m1");
    m2.display("m2");

    m1 *= m2;

    m1.display("m1 * m2");

    m1 += m2;

    m1.display("m1 + m2");

    // Clean memory

    ck(cudaFree(m1.data));
    ck(cudaFree(m2.data));

    return EXIT_SUCCESS;
}

Output

Matrix m1 3 x 3 pixels of d

{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }

Matrix m2 3 x 3 pixels of d

{ 10, 10, 10 }
{ 10, 10, 10 }
{ 10, 10, 10 }

Matrix m1 * m2 3 x 3 pixels of d

{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }

Matrix m1 + m2 3 x 3 pixels of d

Segmentation fault (core dumped)

EDIT 3

Robert Crovella's solution with "nested" template strategy is working nicely.

  • 2
    Did you have an actual question to ask? As a remark, CUDA GPU operations cannot cause a host side segmentation fault. That would require something like out of bounds host memory access or something hosing the stack. So it isn't obvious (a) what you are trying to ask and (b) what the segfault you have shown has to do with the CUDA code you have posted – talonmies Jul 12 '19 at 10:19
  • Hi, I think that the segfault is caused by an illegal operation on device memory due to the operation performed by the kernel function `transformProcess`. The question is not about the segfault but the reason why the applying of the given lambda function caused this segfault. This line `*(A.data + x) = transform(*(A.data + x), *(B.data + x));` fails and I asked why. The expected result is `A(i) = A(i) + B(i)`. This is the question I asked and the segfault is just the result of this misbehavior and this is obviously due to the CUDA kernel code that modify the device memory. – Romain Laneuville Jul 12 '19 at 12:46
  • An illegal GPU memory operation *cannot under any circumstances* cause a host segfault – talonmies Jul 12 '19 at 12:50
  • If I comment `m1 *= m2;` in **main.cpp** I have no segfault. This function calls `transform` that calls `transformProcess` that leads to a segfault. Better, if I comment `*(A.data + x) = transform(*(A.data + x), *(B.data + x));`, there is no segfault. Of course, if I don't perform a memory transfer between device and host with `cudaMemcpy` there is no segfault because the corrupt device memory stays on the device. This is obvious and I'm sure you perfectly understood that when I said "*the segfault is caused by an illegal operation on device memory*" it meant that the device memory is corrupted. – Romain Laneuville Jul 12 '19 at 13:31
  • 2
    See item 1 [here](https://stackoverflow.com/help/on-topic). If you are asking why your code isn't working, you are supposed to provide a [mcve]. What you have shown so far is not one. I have voted to close your question. – Robert Crovella Jul 12 '19 at 14:53
  • The code you have provided is incomplete and uncompilable. I can't help with a runtime error when I can't even compile the code. I have also voted to close this – talonmies Jul 12 '19 at 15:01
  • @RobertCrovella The 3 files I provided (**Types.cuh**, **Types.cu** and **main.cpp**) are exactly the minimal files to reproduce the bug, ie: the segfault (more precisely the failure of the `transform` function in device code). Do you need the *makefile* also ? This must be a joke. – Romain Laneuville Jul 12 '19 at 15:04
  • 2
    Did you actually try to compile precisely and only what you have shown here? Because that is what I tried to do. It didn't work. If you try it, you will also see that it doesn't work. After I added about 4 or 5 things that are needed to your `Types.cuh`, I ended up with a missing function implementation for `setValuesFromVector`. Yep, I could probably fix that too. If you believe that I am supposed to have to do all this to try and run your code, then I'm sorry, I disagree with you. I believe that claiming your code is incomplete is a **very** defensible statement. – Robert Crovella Jul 12 '19 at 15:25
  • You are right about the missing definition of `setValuesFromVector`, I deleted it when I shrank the code down to the minimal version that reproduces the misbehavior, because the `CudaMatrix` struct is wider than that with some CUBLAS functions etc. I apologize for that. Monday I will create a simple github project with those 3 files and a cmake that will compile fine. Sorry again, not easy to extract code from a large project. – Romain Laneuville Jul 13 '19 at 16:37
  • 1
    There's no need to provide anything further in this question, in my opinion. I've already created a working example from your code and addressed the problematic issues in your code, and provided you a solution. The topics in the last paragraph of your question are explained and fixed. If your question is still not answered somehow, my suggestion would be to create a new question. – Robert Crovella Jul 13 '19 at 17:27
  • I think my question is answered by your 1) concerning `nvstd::function` caveat. Your solution using template parameter should work in my scenario, I will dig up a little bit for an elegant solution but I don't think there is one better than using template. Thanks. – Romain Laneuville Jul 14 '19 at 18:40

1 Answers1

2
  1. Perhaps the most important problem in your code was that you were attempting to wrap a device lambda in a nvstd::function, and then pass and use that in device code, and that is not allowed: "One caveat: you still cannot pass nvstd::function objects initialized in host code to device code (and vice versa)."

  2. You were including Types.cuh in main.cpp, but Types.cuh contains device code, and constructs such as __device__, which are not recognized by the host compiler. A file with the name extension of .cpp will be mostly processed by the host compiler, by default. It's possible of course, that you were passing -x cu compiler switch to nvcc to handle this in your Makefile, but I don't know that, so for the benefit of future readers I'm pointing this out. In my "fixed" code below, I made no changes to your main.cpp other than to rename it to main.cu to address this.

  3. You had some incorrect range checking in Types.cu in at least 2 kernels:

    __global__ void fillProcess(
    ...
        if (x > matrix.size()) {  // off-by-one size check
            return;
        }
    ...
    __global__ void transformProcess( 
    ...
        if (x >  A.size()) {      // off-by-one size check
            return;
        }
    

    the standard computer-science off-by-1 error.

  4. There were at least half a dozen items missing from your code to get it to compile.

The item that required the most effort to fix is the first one. For this I elected to use a "nested" template strategy, in order to allow templating for the lambdas, which is (approximately) the only way I know of to transfer a lambda from host to device. I imagine there are other possible approaches, and you might consider the use of functors instead, for the binary functions you have here (since they all have the same input-output prototype).

The following has these issues addressed, and gives sensible output.

$ cat Types.cuh
#include <cublas_v2.h>
#include <string>
#include <vector>
#include <cassert>
#define ck(x) x

typedef unsigned int uint;


// Fixed version: transform() is templated on the functor type so a
// __device__ lambda is carried to the kernel as a deduced template argument
// instead of being wrapped in an nvstd::function (which cannot be
// initialized on the host and then invoked in device code).
template <typename precision>
struct CudaMatrix {
    //typedef nvstd::function<precision(precision, precision)> binaryFunction;

    // Dimensions plus an optional cuBLAS handle; no device allocation here.
    CudaMatrix(uint width, uint height, cublasHandle_t cublasHandle = nullptr) :
               width(width), height(height), cublasHandle(cublasHandle) { }

    // Number of elements (not bytes); callable from host and device.
    __device__ __host__ uint size() const { return width * height; }

    uint       bytesSize() const { return size() * sizeof(precision); }
    void       fill(precision value);
    void       display(const std::string &name = "") const;
    void       setValuesFromVector(const std::vector<precision> vals) const;
    // Nested template: T is deduced from the lambda at each call site.
    template <typename T>
    CudaMatrix transform(const CudaMatrix &A, T fn);

    CudaMatrix& operator=(CudaMatrix m);
    // Element-wise compound operators implemented via transform().
    CudaMatrix  operator+=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x + y; }); }
    CudaMatrix  operator-=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x - y; }); }
    CudaMatrix  operator*=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x * y; }); }

    precision      *data;    // device pointer (allocated externally)
    uint           width,
                   height;
    cublasHandle_t cublasHandle;
};
$ cat Types.cu
#include "Types.cuh"
#include <iostream>
/**
 * Device code to set a matrix value to the given one
 *
 * @tparam precision - The matrix precision
 *
 * @param matrix - The matrix to set the value to
 * @param value - The value to set
 */
template <typename precision>
__global__ void fillProcess(CudaMatrix<precision> matrix, precision value)
{
    // One thread per element; threads past the end of the matrix do nothing
    // (the grid is rounded up to whole blocks by the launcher).
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;

    if (idx < matrix.size()) {
        matrix.data[idx] = value;
    }
}

/**
 * Device code to apply a function f for each element of matrix A and B with A = f(A, B)
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The matrix A to store the result in
 * @param B - The matrix B to compute the result from
 * @param transform - The function to apply on each A'elements such as A(i) = transform(A(i), B(i))
 */
template <typename precision, typename T>
__global__ void transformProcess(               CudaMatrix<precision>                 A,
                                                CudaMatrix<precision>                 B,
                                                T                                     transform
) {
    // Flat 1D global thread index.
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= A.size()) {
        return;
    }

    // `transform` arrives by value as a deduced template type, so the
    // __device__ lambda captured on the host is callable here (unlike an
    // nvstd::function initialized in host code).
    *(A.data + x) = transform(*(A.data + x), *(B.data + x));
}

/**
 * Apply the function "fn" to all elements of the current matrix such as *this[i] = fn(*this[i], A[i])
 *
 * @tparam precision - The matrix precision
 *
 * @param A - The input matrix A
 * @param op - The binary function to apply
 *
 * @return This
 */
template<typename precision> template<typename T>
CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix &A, T fn)
{
    const uint threadsPerBlock = 128;
    // Rounds up; surplus threads are filtered inside the kernel.
    const uint numBlock        = size() / threadsPerBlock + 1;

    // Both operands must have the same shape for the element-wise operation.
    assert(width == A.width);
    assert(height == A.height);

    transformProcess<<< numBlock, threadsPerBlock >>>(*this, A, fn);

    // Shallow copy: the returned matrix shares the same device pointer.
    return *this;
}

/**
 * Fill the matrix with the given value
 *
 * @tparam precision - The matrix precision
 *
 * @param value - The value to set all matrix's elements with
 */
template <typename precision>
void CudaMatrix<precision>::fill(precision value)
{
    const uint threadsPerBlock = 128;
    // Rounds up; the kernel bounds-checks, so a surplus block is harmless.
    const uint numBlock        = size() / threadsPerBlock + 1;

    // @fixme thrust fill method gives error after 1 iteration
    // thrust::device_ptr<precision> thrustPtr = thrust::device_pointer_cast(data);
    // thrust::uninitialized_fill(thrustPtr, thrustPtr + size(), value);

    fillProcess<<< numBlock, threadsPerBlock >>>(*this, value);
}
// Host -> device copy of the vector contents into `data`.
// NOTE(review): there is no check that vals.size() fits within size(); an
// oversized vector would overflow the device allocation — confirm callers.
// `const` here is shallow: the pointed-to device memory is still written.
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> vals) const
{

  cudaMemcpy((*this).data, vals.data(), vals.size()*sizeof(precision), cudaMemcpyHostToDevice);

}
/**
 * Display the matrix
 *
 * @tparam precision - The matrix precision
 *
 * @param name - The matrix name
 */
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name) const
{
    // Stage the device buffer into pinned host memory before printing;
    // the blocking D2H cudaMemcpy guarantees the data is ready afterwards.
    precision *host = nullptr;

    ck(cudaMallocHost(&host, bytesSize()));
    ck(cudaMemcpy(host, data, bytesSize(), cudaMemcpyDeviceToHost));

    std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of " << typeid(precision).name()
              << "\n\n";

    for (int row = 0; row < height; ++row) {
        const precision *line = host + row * width;

        std::cout << "{ ";

        // All but the last column get a trailing comma.
        for (int col = 0; col < width - 1; ++col) {
            std::cout << line[col] << ", ";
        }

        std::cout << line[width - 1] << " }\n";
    }

    std::cout << std::endl;

    ck(cudaFreeHost(host));
}
template class CudaMatrix<double>;
$ cat main.cu
#include "Types.cuh"

int main(int argc, char **argv)
{
    // Allocate memory
    cublasHandle_t cublasHandle = nullptr;

    cublasCreate(&cublasHandle);

    CudaMatrix<double> m1(3, 3, cublasHandle);
    CudaMatrix<double> m2(3, 3, cublasHandle);

    // The matrices do not own their buffers; device memory is allocated here.
    ck(cudaMalloc(&m1.data, m1.bytesSize()));
    ck(cudaMalloc(&m2.data, m2.bytesSize()));

    // Test here

    m1.setValuesFromVector({1, 1, 1, 2, 2, 2, 3, 3, 3});
    m2.fill(10);

    m1.display("m1");
    m2.display("m2");

    // Element-wise multiply via the templated transform() path.
    m1 *= m2;

    m1.display("m1 * m1");

    // Clean memory

    cublasDestroy(cublasHandle);

    ck(cudaFree(m1.data));
    ck(cudaFree(m2.data));

    return EXIT_SUCCESS;
}
$ nvcc -std=c++11  -o test main.cu Types.cu --expt-extended-lambda -lcublas -lineinfo
$ cuda-memcheck ./test
========= CUDA-MEMCHECK
Matrix m1 3 x 3 pixels of d

{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }

Matrix m2 3 x 3 pixels of d

{ 10, 10, 10 }
{ 10, 10, 10 }
{ 10, 10, 10 }

Matrix m1 * m1 3 x 3 pixels of d

{ 10, 10, 10 }
{ 20, 20, 20 }
{ 30, 30, 30 }

========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Hello, I tested your solution (in new project with only the code you provide) but I have a compilation error: `error: ‘CudaMatrix CudaMatrix::transform(const CudaMatrix&, lambdaT) [with lambdaT = CudaMatrix::operator*=(const CudaMatrix&) [with precision = double]::; precision = double]’, declared using local type ‘CudaMatrix::operator*=(const CudaMatrix&) [with precision = double]::’, is used but never defined [-fpermissive]` – Romain Laneuville Jul 15 '19 at 12:28
  • 1
    This is because your main is a main.cpp file, not a main.cu file. You need to make your main a main.cu file, and compile it with nvcc, as I indicated in item 2 in my answer. The reason why it must be a .cu file and processed by `nvcc` is also indicated in item 2 in my answer. – Robert Crovella Jul 15 '19 at 13:33
  • That was it, never had this kind of problem before. CMake makes me stupid. Thank you again. – Romain Laneuville Jul 15 '19 at 15:12