0

I have a 2d type of problem that I solved implementing a kernel. Since the problem is 2d it would be a lot more readable in the kernel if I could do d_A[i][j] instead of d_A[i + m*j] using a column-major matrix ordering. The access d_A[i][j] won't work if I just use cudaMalloc. What function do I have to use for this? I would most appreciate an example. In C++ this is trivially achieved by allocating 2d memory e.g. double** A = new double[10][10];

Is there any relationship to cudaMallocPitch? or the pitch version is only for maximizing 2d alignment and coalesced memory accesses?

SkyWalker
  • 13,729
  • 18
  • 91
  • 187
  • 1
    It's probably easiest just to use the computed index. Creating a proper dynamically allocated variable dimension 2D array is complicated. If you know the dimensions at compile time, you can use the compiler to help make it manageable [(3D example)](http://stackoverflow.com/questions/12924155/sending-3d-array-to-cuda-kernel/12925014#12925014). `cudaMallocPitch` still only handles 1D allocations, but creates pitched storage for maximizing row alignment. – Robert Crovella Jan 14 '14 at 00:06

2 Answers2

3

You can first define a vector class with stripe supported, then the operator[] of the 2d matrix can return a vector with stripe properly set. The second [] will actually be called from the vector. Here is an example:

#define _devhost_ __device__ __host__
typedef long SizeT;

template<typename T>
_devhost_ const T* pointer_offset(const T* ptr, SizeT offset) {
  return reinterpret_cast<const T*>(
      reinterpret_cast<const uint8_t*>(ptr) + offset);
}

typedef enum {
  NonConst = 0,
  Const = 1,
} ConstEnum;

typedef enum {
  NonOwner = 0,
  Owner = 1,
} OwnerEnum;

// Strip is measured in the number of bytes.
typedef enum {
  NonStrip = 0,
  Strip = 1,
} StripEnum;

template<
  typename ValueType, typename Alloc,
  ConstEnum IsConst = NonConst,
  OwnerEnum IsOwner = NonOwner,
  StripEnum HasStrip = NonStrip
> class Vector;

template<
  typename ValueType, typename Alloc,
  ConstEnum IsConst = NonConst,
  OwnerEnum IsOwner = NonOwner
> class DenseMatrix;

template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const> {
protected:
  ValueType* ptr_;
  SizeT len_;

public:
  _devhost_ Vector():ptr_(0), len_(0) {}
  _devhost_ Vector(const ValueType* ptr, SizeT len) {
    ptr_ = const_cast<ValueType*>(ptr);
    len_ = len;
  }

  _devhost_ const ValueType& operator[] (SizeT i) const {
    return ptr_[i];
  }
  _devhost_ SizeT size() const {return len_;}
  _devhost_ const ValueType* data() const {return ptr_;}
};

template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const, NonOwner, Strip>:
  public Vector<ValueType, Alloc, Const> {

protected:
  SizeT strip_;
  typedef Vector<ValueType, Alloc, Const> Base;

  // C++ independent names lookup will not look into base classes which
  // are depended on template arguments. A "using" is required here.
  using Base::ptr_;
  using Base::len_;

public:
  _devhost_ Vector():strip_(sizeof(ValueType)) {}
  _devhost_ Vector(const ValueType* ptr, SizeT len,
      SizeT strip = sizeof(ValueType)):Base(ptr, len), strip_(strip) {}

  _devhost_ const ValueType& operator[] (SizeT i) const {
    return *pointer_offset(ptr_, i * strip_);
  }

  // NOTE: size() and data() still valid,
  // but may not make the right sense here in the presence of stripe.
};

template<typename ValueType, typename Alloc>
class DenseMatrix<ValueType, Alloc, Const> {
protected:
  ValueType* vals_;
  SizeT nrows_, ncols_;

public:
  _devhost_ DenseMatrix() {vals_ = 0; nrows_ = 0; ncols_ = 0;}
  _devhost_ DenseMatrix(const ValueType* vals, SizeT n_rows, SizeT n_cols) {
    nrows_ = n_rows; ncols_ = n_cols;
    vals_ = const_cast<ValueType*>(vals_);
  }

  _devhost_ SizeT num_rows() const {return nrows_;}
  _devhost_ SizeT num_cols() const {return ncols_;}
  _devhost_ SizeT numel() const {return nrows_ * ncols_;}

  _devhost_ const ValueType* data() const {return vals_;}
  _devhost_ const ValueType& at(SizeT irow, SizeT icol) const {
    return vals_[irow + icol * nrows_];
  }

  typedef Vector<ValueType, Alloc, Const, NonOwner, Strip> ConstIndexer;

  _devhost_ ConstIndexer operator[] (SizeT irow) const {
    return ConstIndexer(vals_ + irow, ncols_, nrows_ * sizeof(ValueType));
  }

  _devhost_ DenseMatrix<ValueType, Alloc, Const> get_cols(SizeT icol,
      SizeT n_cols) const {
    return DenseMatrix<ValueType, Alloc, Const>(vals_ + icol * nrows_,
        nrows_, n_cols);
  }

  _devhost_ Vector<ValueType, Alloc, Const> get_col(SizeT icol) const {
    return Vector<ValueType, Alloc, Const>(vals_ + icol * nrows_, nrows_);
  }
};
shaoyl85
  • 1,854
  • 18
  • 30
1

I would simply use a define statement, if your concern is readability

#define A(i,j) d_A[i + m*j]
warunapww
  • 966
  • 4
  • 18
  • 38