I have a custom, tested matrix library based on CRTP for dynamic matrices:
#include <thrust/device_vector.h>
#include <assert.h>
namespace CML
{
// CRTP base class: Derived passes itself as a template argument so the base
// can later static_cast to the concrete matrix type without virtual dispatch.
// BUG FIX: the original repeated the template<...> header on two consecutive
// lines; a class template takes exactly one header, so this was a syntax error.
template<class T, class Derived> class Matrix_Base {};
// Heap-backed row-major matrix with value (deep-copy) semantics.
//
// NOTE(review): new/delete act on the heap of whichever side executes them.
// An object constructed on the host owns a host-heap pointer; if thrust
// bitwise-copies that object to the device and the device-side copy
// constructor then dereferences other.data, that read is an out-of-bounds
// global access (the exact symptom reported by cuda-memcheck). Storage that
// must be visible to device code has to come from cudaMalloc/cudaMallocManaged,
// not from host-side new.
template <class T>
class Dynamic_Matrix : public Matrix_Base<T, Dynamic_Matrix<T>>
{
private:
    size_t n_rows, n_cols;  // current dimensions; both 0 when empty
    T* data;                // owned buffer of n_rows * n_cols elements, or nullptr
    bool is_allocated;      // true iff data points at a live allocation

    // Allocate a buffer sized for the current n_rows x n_cols.
    // Precondition: both dimensions are non-zero.
    __host__ __device__ void allocate_data()
    {
        assert(n_rows > 0 && n_cols > 0);
        data = new T[n_rows * n_cols];
        is_allocated = true;
    }

    // Release the buffer (delete[] nullptr is a no-op) and reset to empty.
    __host__ __device__ void deallocate_data()
    {
        delete[] data;
        data = nullptr;
        is_allocated = false;
    }

    // Deep-copy `other` into *this.
    // BUG FIX: the original kept an existing allocation even when its element
    // count differed from other's, so copying a larger matrix into an already
    // allocated smaller one wrote past the end of the buffer. It also returned
    // early on an empty source without touching *this, leaving stale dimensions
    // (and, via operator=, a deallocated-but-nonzero-sized state).
    __host__ __device__ void assign_data(const Dynamic_Matrix &other)
    {
        if (!other.is_allocated)
        {
            // Copying an empty matrix yields an empty matrix.
            deallocate_data();
            n_rows = 0;
            n_cols = 0;
            return;
        }
        const size_t needed = other.n_rows * other.n_cols;
        if (is_allocated && n_rows * n_cols != needed)
        {
            deallocate_data();  // wrong capacity: reallocate below
        }
        n_rows = other.n_rows;
        n_cols = other.n_cols;
        if (!is_allocated) { allocate_data(); }
        for (size_t i = 0; i < n_rows; i++)
        {
            for (size_t j = 0; j < n_cols; j++)
            {
                data[n_cols * i + j] = other.data[n_cols * i + j];
            }
        }
    }

public:
    // Empty 0x0 matrix; no allocation.
    __host__ __device__ Dynamic_Matrix() : n_rows(0), n_cols(0), data(nullptr), is_allocated(false) {}

    // n_rows x n_cols matrix; elements are left uninitialized.
    __host__ __device__ Dynamic_Matrix(const size_t n_rows, const size_t n_cols) :
        n_rows(n_rows), n_cols(n_cols), data(nullptr), is_allocated(false)
    {
        allocate_data();
    }

    // Deep copy.
    // BUG FIX: the original did not initialize n_rows/n_cols here, so copying
    // an empty matrix left the dimensions holding indeterminate garbage.
    __host__ __device__ Dynamic_Matrix(const Dynamic_Matrix &other) :
        n_rows(0), n_cols(0), data(nullptr), is_allocated(false)
    {
        assign_data(other);
    }

    __host__ __device__ ~Dynamic_Matrix() { deallocate_data(); }

    // Copy assignment; self-assignment safe. assign_data now reallocates on a
    // size mismatch itself, so the old unconditional deallocate is gone (it
    // also destroyed the buffer before checking whether rhs had one).
    __host__ __device__ Dynamic_Matrix& operator=(const Dynamic_Matrix &rhs)
    {
        if (this != &rhs)
        {
            assign_data(rhs);
        }
        return *this;
    }

    // Discard contents and reallocate to new_rows x new_cols; elements are
    // left uninitialized. Parameters renamed so they no longer shadow the
    // data members of the same name.
    __host__ __device__ void resize(const size_t new_rows, const size_t new_cols)
    {
        assert(new_rows > 0 && new_cols > 0);
        *this = Dynamic_Matrix<T>(new_rows, new_cols);
    }
};
using MatrixXd = Dynamic_Matrix<double>;
};
which is used in some classes in my project, similar to this simple one:
#include <thrust/device_vector.h>
// Example client of the matrix library: owns four matrices by value and
// shapes them in its constructor.
class My_Class
{
private:
    // Working matrices; each is given a 3x1 shape at construction time,
    // which allocates its backing storage via Dynamic_Matrix::resize.
    CML::MatrixXd mat1;
    CML::MatrixXd mat2;
    CML::MatrixXd mat3;
    CML::MatrixXd mat4;

public:
    // NOTE(review): when this runs on the host, the members allocate with
    // host-side new — presumably not device-visible memory; verify before
    // copying instances into device code.
    __host__ __device__ My_Class()
    {
        mat1.resize(3, 1);
        mat2.resize(3, 1);
        mat3.resize(3, 1);
        mat4.resize(3, 1);
    }
};
but, when I try to run a thrust::transform with a functor (simplified here) as in
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
// Transform functor holding a My_Class instance by value; thrust copies the
// whole object to the device when the algorithm is launched.
class myFunctor
{
private:
    My_Class my_class;

public:
    __host__ __device__ myFunctor() {}

    // Placeholder per-index computation; currently always yields 0.
    __device__ double operator()(const unsigned int n)
    {
        double result = 0;
        return result;
    }
};
// Launch a device-side transform over n_cbs indices.
// Requires <thrust/transform.h> and <thrust/execution_policy.h> (added to the
// include block) — the original relied on transitive includes for both.
int main()
{
    const int n_cbs = 10;
    thrust::device_vector<double> cb_costs(n_cbs);
    thrust::counting_iterator<unsigned int> index_iter(0);
    // NOTE(review): myFunctor() is constructed on the HOST here and then
    // copied to the device by thrust::transform; any raw host-heap pointers
    // inside it (via My_Class's matrices) are invalid on the device.
    thrust::transform(thrust::device, index_iter, index_iter + n_cbs,
                      cb_costs.begin(), myFunctor());
    return 0;
}
this gives me error Invalid __global__ read of size 8 ========= at 0x00001250 in /../dynamic_matrix.cuh:724:CML::Dynamic_Matrix<double>::assign_data(CML::Dynamic_Matrix<double> const &) ========= by thread (255,0,0) in block (0,0,0) ========= Address 0x55965a2ce530 is out of bounds
when using cuda-memcheck on the application. The line in the error report is the this->data[n_cols * i + j] = other.data[n_cols * i + j];
line in the matrix library. As far as my debugging has taken me, I have not found that any index goes out of range here. Any suggestions as to what the problem is? Commenting out the matrix members in the constructor makes the code runnable.