I am having trouble using lambda expressions in CUDA kernel code: the program compiles but fails at execution. I am on Ubuntu 18.04 with CUDA 10.1 and compile with --expt-extended-lambda.
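For completeness, the build is essentially a single nvcc invocation along these lines (a sketch rather than my exact command; -x cu is needed so that main.cpp, which instantiates the extended __device__ lambdas through the header, is compiled as CUDA code):

nvcc -std=c++14 --expt-extended-lambda -x cu main.cpp Types.cu -o test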
I am just trying to implement some basic operations on a custom device matrix (element-wise multiplication, addition, subtraction, etc.) in a clean and concise way.
I have tested Thrust, but it led to several device memory errors when mixed with more complex custom CUDA kernel code. Wrapping memory allocated manually with cudaMalloc into a thrust::device_ptr and then calling Thrust routines on it did not work well either, so I would prefer to get rid of Thrust.
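For context, the Thrust pattern I am referring to is roughly the following (an illustrative sketch, not my actual code):

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

int main()
{
    const int n = 9;
    double *dA, *dB;
    cudaMalloc(&dA, n * sizeof(double));
    cudaMalloc(&dB, n * sizeof(double));

    // Wrap the raw device pointers so Thrust algorithms can operate on them
    thrust::device_ptr<double> a = thrust::device_pointer_cast(dA);
    thrust::device_ptr<double> b = thrust::device_pointer_cast(dB);

    thrust::fill(a, a + n, 1.0);
    thrust::fill(b, b + n, 10.0);

    // Element-wise a = a * b
    thrust::transform(a, a + n, b, a, thrust::multiplies<double>());

    cudaFree(dA);
    cudaFree(dB);
    return 0;
}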
Here is a small sample showing the basic usage that fails, and I don't know why. The transform / transformProcess methods fail: apparently the lambda expression passed as a binaryFunction cannot be invoked in device code.
EDIT 2 (fixed the code so that it compiles without errors)
Types.cuh
#ifndef TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH
#define TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH
#include <cuda_runtime.h>
#include <nvfunctional>
#include <iostream>
#include <vector>
#include <string>
typedef unsigned int uint;
inline bool check(int e, int iLine, const char *szFile) {
    if (e < 0) {
        std::cerr << "General error " << e << " at line " << iLine << " in file " << szFile << std::endl;
        return false;
    }
    return true;
}
#define ck(call) check(call, __LINE__, __FILE__)
template <typename precision>
struct CudaMatrix {
    typedef nvstd::function<precision(precision, precision)> binaryFunction;

    CudaMatrix(uint width, uint height) : width(width), height(height) { }

    __device__ __host__ uint size() const { return width * height; }
    uint bytesSize() const { return size() * sizeof(precision); }

    void fill(precision value);
    void setValuesFromVector(const std::vector<precision> &vector);
    void display(const std::string &name = "") const;
    CudaMatrix transform(const CudaMatrix &A, binaryFunction fn);

    // Element-wise operators, implemented through transform and an extended __device__ lambda
    CudaMatrix operator+=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x + y; }); }
    CudaMatrix operator-=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x - y; }); }
    CudaMatrix operator*=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x * y; }); }

    precision *data; // device pointer, allocated by the caller with cudaMalloc
    uint width,
         height;
};
#endif //TEST_CUDA_DEVICE_LAMBDA_PROCESSING_TYPES_CUH
Types.cu
#include "Types.cuh"
/**
* Device code to set a matrix value to the given one
*
* @tparam precision - The matrix precision
*
* @param matrix - The matrix to set the value to
* @param value - The value to set
*/
template <typename precision>
__global__ void fillProcess(CudaMatrix<precision> matrix, precision value)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= matrix.size()) {
        return;
    }

    *(matrix.data + x) = value;
}
/**
* Device code to apply a function f for each element of matrix A and B with A = f(A, B)
*
* @tparam precision - The matrix precision
*
* @param A - The matrix A to store the result in
* @param B - The matrix B to compute the result from
* @param transform - The function to apply on each A'elements such as A(i) = transform(A(i), B(i))
*/
template <typename precision>
__global__ void transformProcess( CudaMatrix<precision> A,
                                  CudaMatrix<precision> B,
                                  const typename CudaMatrix<precision>::binaryFunction &transform
) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x >= A.size()) {
        return;
    }

    // This is the call that apparently cannot be executed in device code
    *(A.data + x) = transform(*(A.data + x), *(B.data + x));
}
/**
* Display the matrix
*
* @tparam precision - The matrix precision
*
* @param name - The matrix name
*/
template <typename precision>
void CudaMatrix<precision>::display(const std::string &name) const
{
    precision *hostValues;

    ck(cudaMallocHost(&hostValues, bytesSize()));
    ck(cudaMemcpy(hostValues, data, bytesSize(), cudaMemcpyDeviceToHost));

    std::cout << "Matrix " << name << " " << width << " x " << height << " pixels of " << typeid(precision).name()
              << "\n\n";

    for (int i = 0; i < height; ++i) {
        std::cout << "{ ";
        for (int j = 0; j < width - 1; ++j) {
            std::cout << *(hostValues + i * width + j) << ", ";
        }
        std::cout << *(hostValues + (i + 1) * width - 1) << " }\n";
    }
    std::cout << std::endl;

    ck(cudaFreeHost(hostValues));
}
/**
* Fill the matrix with the given value
*
* @tparam precision - The matrix precision
*
* @param value - The value to set all matrix's elements with
*/
template <typename precision>
void CudaMatrix<precision>::fill(precision value)
{
    const uint threadsPerBlock = 128;
    const uint numBlock = size() / threadsPerBlock + 1;

    fillProcess<<< numBlock, threadsPerBlock >>>(*this, value);
}
/**
* Set the matrix values in device CUDA memory from a host standard vector
*
* @param vector - The values to set
*/
template <typename precision>
void CudaMatrix<precision>::setValuesFromVector(const std::vector<precision> &vector)
{
    ck(cudaMemcpy(data, vector.data(), vector.size() * sizeof(precision), cudaMemcpyHostToDevice));
}
/**
* Apply the function "fn" to all elements of the current matrix such as *this[i] = fn(*this[i], A[i])
*
* @tparam precision - The matrix precision
*
* @param A - The input matrix A
* @param fn - The binary function to apply
*
* @return This
*/
template <typename precision>
CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix &A, binaryFunction fn)
{
    const uint threadsPerBlock = 128;
    const uint numBlock = size() / threadsPerBlock + 1;

    transformProcess<<< numBlock, threadsPerBlock >>>(*this, A, fn);

    return *this;
}
// Explicit template instantiations
template struct CudaMatrix<double>;
template struct CudaMatrix<float>;
template struct CudaMatrix<int>;
main.cpp
#include "Types.cuh"
int main(int argc, char **argv)
{
    // Allocate memory
    CudaMatrix<double> m1(3, 3);
    CudaMatrix<double> m2(3, 3);

    ck(cudaMalloc(&m1.data, m1.bytesSize()));
    ck(cudaMalloc(&m2.data, m2.bytesSize()));

    // Test here
    m1.setValuesFromVector({1, 1, 1, 2, 2, 2, 3, 3, 3});
    m2.fill(10);

    m1.display("m1");
    m2.display("m2");

    m1 *= m2;
    m1.display("m1 * m2");

    m1 += m2;
    m1.display("m1 + m2");

    // Clean memory
    ck(cudaFree(m1.data));
    ck(cudaFree(m2.data));

    return EXIT_SUCCESS;
}
Output
Matrix m1 3 x 3 pixels of d
{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }
Matrix m2 3 x 3 pixels of d
{ 10, 10, 10 }
{ 10, 10, 10 }
{ 10, 10, 10 }
Matrix m1 * m2 3 x 3 pixels of d
{ 1, 1, 1 }
{ 2, 2, 2 }
{ 3, 3, 3 }
Matrix m1 + m2 3 x 3 pixels of d
Segmentation fault (core dumped)
EDIT 3
Robert Crovella's solution based on the "nested" template strategy works nicely.
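For future readers, the idea is to drop nvstd::function entirely and make both the kernel and CudaMatrix::transform templates over the functor type, so the extended __device__ lambda is passed to the kernel by value. Below is my own condensed sketch of that strategy, not Robert's exact code; everything is kept in the header so the member template can be instantiated where the operators are used, and main.cpp has to be compiled as CUDA code (e.g. with -x cu):

template <typename precision>
struct CudaMatrix {
    CudaMatrix(uint width, uint height) : width(width), height(height) { }

    __device__ __host__ uint size() const { return width * height; }

    // "Nested" template: the functor type is a template parameter instead of an nvstd::function
    template <typename TransformFn>
    CudaMatrix transform(const CudaMatrix &A, TransformFn fn);

    CudaMatrix operator+=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x + y; }); }
    CudaMatrix operator*=(const CudaMatrix &m) { return transform(m, [=] __device__ (precision x, precision y) { return x * y; }); }

    // ... fill, display, setValuesFromVector, operator-= unchanged ...

    precision *data;
    uint width, height;
};

// The kernel receives the lambda by value as a plain template-typed argument
template <typename precision, typename TransformFn>
__global__ void transformProcess(CudaMatrix<precision> A, CudaMatrix<precision> B, TransformFn fn)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;

    if (x < A.size()) {
        *(A.data + x) = fn(*(A.data + x), *(B.data + x));
    }
}

template <typename precision>
template <typename TransformFn>
CudaMatrix<precision> CudaMatrix<precision>::transform(const CudaMatrix &A, TransformFn fn)
{
    const uint threadsPerBlock = 128;
    const uint numBlock = size() / threadsPerBlock + 1;

    transformProcess<<< numBlock, threadsPerBlock >>>(*this, A, fn);

    return *this;
}

With this approach there is nothing to marshal through nvstd::function: the closure object is copied to the device as a regular kernel argument, which is exactly what the extended-lambda machinery is designed for.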