15

How to allocate a 2D array of size M×N? And how do I traverse that array in CUDA?

// Element-wise matrix addition: C = A + B over BLOCK_SIZE x BLOCK_SIZE matrices.
// Expects a 2D launch where gridDim*blockDim covers at least BLOCK_SIZE in each axis;
// each thread computes exactly one (row, col) element.
__global__ void test(int A[BLOCK_SIZE][BLOCK_SIZE], int B[BLOCK_SIZE][BLOCK_SIZE],int C[BLOCK_SIZE][BLOCK_SIZE])
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside the matrix (launch overshoot) do nothing.
    if (row >= BLOCK_SIZE || col >= BLOCK_SIZE)
        return;

    C[row][col] = A[row][col] + B[row][col];
}

int main()
{
    // Host matrices (the original passed these stack arrays straight to the
    // kernel, which dereferences them as device memory — illegal address).
    int A[BLOCK_SIZE][BLOCK_SIZE];
    int B[BLOCK_SIZE][BLOCK_SIZE];
    int C[BLOCK_SIZE][BLOCK_SIZE];

    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++)
        {
            A[i][j] = i + j;
            B[i][j] = i + j;
        }

    // Proper device allocations, typed so they match the kernel's 2D parameters.
    const size_t bytes = BLOCK_SIZE * BLOCK_SIZE * sizeof(int);
    int (*d_A)[BLOCK_SIZE];
    int (*d_B)[BLOCK_SIZE];
    int (*d_C)[BLOCK_SIZE];
    cudaMalloc((void**)&d_A, bytes);
    cudaMalloc((void**)&d_B, bytes);
    cudaMalloc((void**)&d_C, bytes);

    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // BLOCK_SIZE*BLOCK_SIZE threads per block
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);

    test<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Catch launch-configuration errors (kernel launches return no status).
    if (cudaGetLastError() != cudaSuccess)
    {
        printf("kernel launch failed\n");
        return 1;
    }

    // Bug in the original: the byte count was BLOCK_SIZE*BLOCK_SIZE with no
    // sizeof(int). A blocking cudaMemcpy also synchronizes with the kernel.
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++)
            printf("%d\n", C[i][j]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
Adriaan
  • 17,741
  • 7
  • 42
  • 75
Sandeep
  • 663
  • 2
  • 8
  • 18
  • 1
    You can't copy back the values of a 2D array with a plain cudaMemcpy; instead you have to use cudaMallocPitch, or cudaPitchedPtr with cudaMalloc3D, as @Dave said – ardiyu07 Feb 17 '11 at 17:26

2 Answers2

20

How to allocate 2D array:

int main() {
    #define BLOCK_SIZE 16
    #define GRID_SIZE 1

    // Host-side matrices. Note: a stack array cannot be handed to a kernel
    // directly — the kernel would dereference host addresses on the device.
    int h_A[BLOCK_SIZE][BLOCK_SIZE];
    int h_B[BLOCK_SIZE][BLOCK_SIZE];

    /* h_A / h_B initialization */

    // Allocate matching device buffers and copy the host data over.
    int (*d_A)[BLOCK_SIZE];
    int (*d_B)[BLOCK_SIZE];
    size_t bytes = BLOCK_SIZE * BLOCK_SIZE * sizeof(int);
    cudaMalloc((void**)&d_A, bytes);
    cudaMalloc((void**)&d_B, bytes);
    cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // BLOCK_SIZE*BLOCK_SIZE threads, 256 in this case
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);    // 1*1 blocks in a grid

    YourKernel<<<dimGrid, dimBlock>>>(d_A, d_B); // Kernel invocation

    // Wait for the kernel before the buffers go away / program exits.
    cudaDeviceSynchronize();
    cudaFree(d_A);
    cudaFree(d_B);
}

How to traverse that array:

// One thread per (row, col) element of the BLOCK_SIZE x BLOCK_SIZE matrices.
// Expects a 2D launch; out-of-range threads exit early.
__global__ void YourKernel(int d_A[BLOCK_SIZE][BLOCK_SIZE], int d_B[BLOCK_SIZE][BLOCK_SIZE]){
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Original guard compared against undefined h/w; bound by the declared extents.
    if (row >= BLOCK_SIZE || col >= BLOCK_SIZE) return;
    /* whatever you wanna do with d_A[][] and d_B[][] */
}

I hope this is helpful, and also you can refer to CUDA Programming Guide about Matrix Multiplication

eco-model
  • 13
  • 2
ardiyu07
  • 1,790
  • 2
  • 17
  • 29
  • 3
    @user621508 while this will work, it just creates one huge linear array in device memory. You also can use [cudaMalloc3D](http://developer.download.nvidia.com/compute/cuda/3_2/toolkit/docs/online/group__CUDART__MEMORY_g04a7553c90322aef32f8544d5c356a10.html#g04a7553c90322aef32f8544d5c356a10) to allocate two-dimensional arrays that are optimized for 2D-data access. I didn't know whether you just wanted the indexing of a 2D-array or the performance. – Dave O. Feb 17 '11 at 15:37
  • 2
    @username_4567, that's what /* d_A initialization */ stands for. However memory freeing is absent. – Denys S. Sep 21 '13 at 20:07
  • 7
    the actual content of /* d_A initialization */ is also an important part of the answer. Can you provide it as well? – JRsz Nov 20 '16 at 17:09
  • Cuda kernels do not use return –  Mar 07 '22 at 13:11
12

The best way would be storing a two-dimensional array A in its vector (flattened) form. For example, if you have a matrix A of size n×m, its (i,j) element in pointer-to-pointer representation would be

A[i][j] (with i=0..n-1 and j=0..m-1). 

In a vector form you can write

A[i*m+j] (with i=0..n-1 and j=0..m-1), i.e. row index times the row length m, plus the column index.

Using one-dimensional array in this case will simplify the copy process, which would be simple:

double *A, *dev_A; // A - host pointer, dev_A - device pointer
A = (double*)malloc(n * m * sizeof(double));
cudaMalloc((void**)&dev_A, n * m * sizeof(double));
// Bug in the original: it passed &dev_A and &A (the addresses of the pointer
// variables). cudaMemcpy takes the destination/source buffer pointers themselves.
cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); // A holds doubles
Suren
  • 123
  • 1
  • 4