-1

I have a custom matrix library based on CRTP, tested, for dynamic matrices:

#include <thrust/device_vector.h>
#include <assert.h>

namespace CML
{
    // Empty CRTP base: Derived is the concrete matrix type.
    // NOTE(review): the original snippet had a stray duplicate
    // `template<class T, class Derived>` header above this declaration,
    // which is ill-formed C++; it has been removed.
    template<class T, class Derived> class Matrix_Base {};

    template <class T>
    class Dynamic_Matrix : public Matrix_Base<T, Dynamic_Matrix<T>>
    {
    private:
        size_t n_rows, n_cols; // current dimensions; 0 when empty
        T* data;               // owned buffer of n_rows * n_cols elements
        bool is_allocated;     // true while `data` points at a live buffer

        // Allocate storage for the current dimensions.
        // NOTE(review): `new` allocates on whichever runtime heap executes it
        // (host heap on the CPU, device heap on the GPU), so an instance
        // holding such a pointer is NOT trivially copyable across the
        // host/device boundary — copying the object to the device leaves it
        // with a dangling host pointer (the root cause of the reported
        // invalid-read error).
        __host__ __device__ void allocate_data()
        {
            assert(n_rows > 0 && n_cols > 0);
            data = new T[n_rows * n_cols];
            is_allocated = true;
        }

        // Release storage; safe when nothing is allocated
        // (delete[] of nullptr is a no-op).
        __host__ __device__ void deallocate_data()
        {
            delete[] data;
            data = nullptr;
            is_allocated = false;
        }

        // Deep-copy dimensions and elements from `other`.
        // Fix: the original wrote through the existing buffer even when
        // `this` was already allocated with a different size; we now
        // reallocate whenever the element counts differ. The debug printf
        // scaffolding has been removed (the zero-dimension case is
        // unreachable once `other.is_allocated` implies nonzero dims via
        // the assert in allocate_data()).
        __host__ __device__ void assign_data(const Dynamic_Matrix &other)
        {
            if (!other.is_allocated) { return; }

            if (is_allocated && n_rows * n_cols != other.n_rows * other.n_cols)
            {
                deallocate_data();
            }
            n_rows = other.n_rows;
            n_cols = other.n_cols;
            if (!is_allocated) { allocate_data(); }

            for (size_t i = 0; i < n_rows * n_cols; i++)
            {
                data[i] = other.data[i];
            }
        }

    public:
        // Empty matrix: no storage until resize() or assignment.
        __host__ __device__ Dynamic_Matrix() : n_rows(0), n_cols(0), data(nullptr), is_allocated(false) {}

        // Allocating constructor; both dimensions must be nonzero.
        __host__ __device__ Dynamic_Matrix(const size_t n_rows, const size_t n_cols) :
        n_rows(n_rows), n_cols(n_cols), data(nullptr), is_allocated(false)
        {
            allocate_data();
        }

        // Deep copy. Fix: the original left n_rows/n_cols uninitialized
        // when `other` was empty; they are now zero-initialized.
        __host__ __device__ Dynamic_Matrix(const Dynamic_Matrix &other):
        n_rows(0), n_cols(0), data(nullptr), is_allocated(false)
        {
            assign_data(other);
        }

        __host__ __device__ ~Dynamic_Matrix() { deallocate_data(); }

        // Copy assignment with self-assignment guard; releases the old
        // buffer, then deep-copies from rhs.
        __host__ __device__ Dynamic_Matrix& operator=(const Dynamic_Matrix &rhs)
        {
             if (this == &rhs)
             {
                 return *this;
             }
             deallocate_data();
             assign_data(rhs);
             return *this;
        }

        // Discard current contents and adopt the new shape.
        // Fix: the original constructed a temporary matrix and copy-assigned
        // it (an extra allocation plus a full element copy); we reallocate
        // directly, and skip the work entirely when the shape is unchanged.
        __host__ __device__ void resize(const size_t new_rows, const size_t new_cols)
        {
              assert(new_rows > 0 && new_cols > 0);
              if (is_allocated && n_rows == new_rows && n_cols == new_cols) { return; }
              deallocate_data();
              n_rows = new_rows;
              n_cols = new_cols;
              allocate_data();
        }
    };

    using MatrixXd = Dynamic_Matrix<double>;
} // namespace CML

which is used in some classes in my project, similar to this simple one:

#include <thrust/device_vector.h>

class My_Class
{
private:
    // Four working matrices; each is given its 3x1 shape in the constructor.
    CML::MatrixXd mat1;
    CML::MatrixXd mat2;
    CML::MatrixXd mat3;
    CML::MatrixXd mat4;

public:
    // Default-construct the members empty, then size every one to 3x1.
    __host__ __device__ My_Class()
    {
        mat1.resize(3, 1);
        mat2.resize(3, 1);
        mat3.resize(3, 1);
        mat4.resize(3, 1);
    }
};

but, when I try to run a thrust::transform with a functor (simplified here) as in

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>

class myFunctor
{
private:
    // Held by value, so passing this functor to a kernel launch copies the
    // whole My_Class instance into the kernel argument.
    My_Class my_class;

public:
    __host__ __device__ myFunctor() {}

    // Simplified cost function: every index maps to zero.
    __device__ double operator()(const unsigned int n) { return 0.0; }
};

int main()
{
    const int n_cbs = 10;
    thrust::device_vector<double> cb_costs(n_cbs);

    // Indices 0 .. n_cbs-1 are generated lazily by the counting iterator.
    thrust::counting_iterator<unsigned int> index_iter(0);

    // Evaluate the functor on the device for every index and store the
    // resulting costs in cb_costs.
    thrust::transform(thrust::device, index_iter, index_iter + n_cbs,
                      cb_costs.begin(), myFunctor());
    return 0;
}

this gives me error Invalid __global__ read of size 8 ========= at 0x00001250 in /../dynamic_matrix.cuh:724:CML::Dynamic_Matrix<double>::assign_data(CML::Dynamic_Matrix<double> const &) ========= by thread (255,0,0) in block (0,0,0) ========= Address 0x55965a2ce530 is out of bounds when using cuda-memcheck on the application. The line in the error report is the this->data[n_cols * i + j] = other.data[n_cols * i + j]; line in the matrix library. So far as debugging has taken me, I have not found that any indices go out of range with this. Any suggestions towards what is the problem? Commenting out the matrix objects in the constructor makes the code runnable.

A_Man
  • 131
  • 7
  • Your code is ill-formed: https://en.cppreference.com/w/cpp/language/identifiers "the identifiers with a double underscore anywhere are reserved" – Bernd Sep 16 '20 at 19:49
  • Can you elaborate? Double underscore in `__host__ __device__` is defined in e.g. – A_Man Sep 16 '20 at 20:00
  • Okay, sorry, those are some CUDA / NVidia compiler extensions to run code on the GPU. But it is not standard C++. The files with such extensions are named ".cu". --- In general, double underscore is reserved for the standard library - but your standard library... But here it seems to be fine. – Bernd Sep 16 '20 at 20:14
  • I would suggest providing a [mcve]. I tried to assemble the pieces you have provided here and could not get it to compile. – Robert Crovella Sep 16 '20 at 20:17
  • 1
    I read some CUDA docs, now. You can't use simply `new` or `malloc` for memory allocation. You need to call cudaMalloc for GPU / CPU code. – Bernd Sep 16 '20 at 20:27
  • 1
    In general I would define custom macros like MYLIB_CPU_GPU_CODE. This would allow you to use your library without a CUDA compiler / CPU only. In addition I would create my own Malloc / Free functions which call, in the case of CUDA, cudaMalloc / cudaFree, and new / delete for non-CUDA code. – Bernd Sep 16 '20 at 20:32
  • 2
    The copy constructor is being called due to the kernel launch mechanism (thrust dispatch includes the functor object as a kernel argument for the `transform` algorithm, and the copy constructor gets called due to this object) and the copy constructor calls `assign_data` which has the hazard roughly as described by Bernd. One of the two objects will have its data members instantiated on the host, the other will have its data members instantiated on the device, and the copy constructor does not account for this. – Robert Crovella Sep 16 '20 at 22:11
  • Thank you for the good replies. @Bernd do you have a suggestion for how to switch between using `new/delete[]` and `cudaMalloc/cudaFree` depending on whether the library is used on the GPU or not (and I will mark as answer)? I don't immediately see how this macro you describe could be used. This library I made is primarily for GPU usage. – A_Man Sep 17 '20 at 06:10
  • 1
    You can't switch between allocation automagically. Both have to run on the host, so they are both __host__ functions and the compiler can't differentiate compilation cases by itself. There is a reason why thrust has separate device_vector and host_vector types, even though both are host side types. Probably the only solution would be template tag style specialization of the type unless you want to do what I outlined in my answer – talonmies Sep 17 '20 at 07:54

1 Answer

2

As has been pointed out in comments, the problem is the portability of your Dynamic_Matrix class between the host and device.

In principle, the design is OK, as long as you only either instantiate and use in host memory or instantiate and use in device memory. The new operator is fine to use in both host and device code (although the device allocation goes on a runtime heap which must be a priori sized and which is inaccessible from the host APIs), but then the class instance is not portable. The data allocation is either in host memory or device memory and not accessible in the runtime space where it was not allocated. When you do this:

thrust::transform(thrust::device, index_iter, index_iter + n_cbs, cb_costs.begin(), myFunctor());

you are implicitly asking for your myFunctor instance to be copied to the device, and this is where the code breaks. The data pointers are host pointers, allocated with host new and the device code blows up with invalid pointer errors.

As a rule of thumb, anything you use with thrust should be a POD type. That means they are trivially copyable to the device without portability issues. In all likelihood you will need to refactor your code to eliminate use of new and delete and pass the pointer to the memory which will hold the class data (which you manually allocate on the device or in managed memory) to a version of the constructor for that purpose. Alternatively consider statically declaring the data array via template specialization so your class doesn't require dynamic memory allocation at all. Eigen uses this approach in its CUDA support.

Although it doesn't apply here (because thrust doesn't use pass by reference for functors), note that there is also a very elegant design pattern for using managed memory to allocate classes which can safely be passed by reference. Classes allocated in this fashion are portable to both host and device, although their ultimate performance will be inferior, because of the overhead of the managed memory they use.

talonmies
  • 70,661
  • 34
  • 192
  • 269
  • Can you point me to information on how to declare data arrays statically as Eigen does? I have looked for this, but not found anything sensible yet. – A_Man Sep 21 '20 at 06:45
  • 1
    https://eigen.tuxfamily.org/dox/group__TutorialMatrixClass.html – talonmies Sep 21 '20 at 06:52
  • Thank you, I will consider that. Related to the post: If I would use a pointer to `My_Class` in `myFunctor` instead, and use cudaMalloc and cudaFree to allocate it on the device, the code should be workable? I would then construct `myFunctor` first, where cudaMalloc is used, and then pass it to the `thrust::transform` – A_Man Sep 21 '20 at 14:14