2D array in the Kernel CUDA

Question

I wanted to used 2D array on GPU as we do on CPU. Thus the below code. It executes without errors but returns some garbage values.

Could anyone please suggest me what might have went wrong...!

Thank you.

#include<stdio.h>

__global__ void add2(int** da)
{
 int idx=threadIdx.x;
 int idy=threadIdx.y;

da[idx][idy]+=2;
// printf("It came here");
printf("%d \n",da[idx][idy]);
}

int main()
{
 int ha[2][2],**da, hb[2][2];
 size_t pitch;
 for(int i=0;i<2;i++)
    {
    for(int j=0;j<2;j++)
       ha[i][j]=0;
    }

 cudaMallocPitch((void**)&da, &pitch, 2*sizeof(int),2);
 cudaMemcpy2D(&da, 2*sizeof(int), ha, pitch, 2*sizeof(int), 2, cudaMemcpyHostToDevice);

 printf("Before kernel\n");
 for(int i=0;i<2;i++)
    {
     for(int j=0;j<2;j++)
     printf("%d ",ha[i][j]);
     printf("\n");
    }
 printf("\n");

 add2<<<2,2>>>(da);
 // gpuErrchk(cudaPeekAtLastError());
 // gpuErrchk(cudaDeviceSynchronize());

 cudaMemcpy2D(&hb, 2*sizeof(int), da, pitch, 2*sizeof(int), 2, cudaMemcpyDeviceToHost);

 printf("After kernel\n");
 for(int i=0;i<2;i++)
    {
     for(int j=0;j<2;j++)
    printf("%d ",hb[i][j]);
    printf("\n");
    }
  return 0;
}

Have a look at [this](http://stackoverflow.com/a/9974989/2386951) post. You need to revise your program thoroughly. — Farzad, Oct 22 '13 at 17:34
possible duplicate of [2d char array to CUDA kernel](http://stackoverflow.com/questions/19459788/2d-char-array-to-cuda-kernel) — Robert Crovella, Oct 23 '13 at 02:00
@RobertCrovella, I don't want to use the flattened 2D array. Is there a way how can the 2D array be accessed as we do usually on cpu..! I know that the memory is linear and this '[][]' version is just for visualization. Is there a way I can do it ...? — user86927, Oct 23 '13 at 07:33
Yes, there's a way you can do it. If you refer to the linked question that I marked as a duplicate, it links to an answer given by @talonmies that shows one possible way to do it. Or just put "cuda 2D array" in the search box in the upper right corner and you'll get a variety of questions where it's discussed. — Robert Crovella, Oct 23 '13 at 17:24
@RobertCrovella, thank you. I saw the post by talonmies. He uses loops in the kernel, doesn't it the sequential version of performing the operations but just in the kernel...? And he also mentioned that, we need nested 'CudaMalloc': The below is for 2x2 matrix (from his post) `cudaMalloc((void**)&h_a[0], 2*sizeof(int));` `cudaMalloc((void**)&h_a[1], 2*sizeof(int));` So, it means, for a 100x100 matrix, do I need to write these statements 100 times...!! PS: I couldn't comment in the post (for which I've no idea), that's why I am discussing here. — user86927, Oct 24 '13 at 03:51
No you don't need to write the statements 100 times. You extend the loops he indicated in his answer and you extend the size of the pointer arrays. Take note of the number `2`, both in the loops and in the specific lines of code you excerpted. — Robert Crovella, Oct 26 '13 at 15:15

score 3 · Accepted Answer · answered Oct 23 '13 at 03:46

One of the other approaches to 2D arrays is, if you think its nothing but the arrangement of the elements in memory. The following code explains you such an approach with row-major order and more generalised way to write with proper error checking.

 #include<stdio.h>

 #define NUM_ROWS 2
 #define NUM_COLS 2

 __global__ void add2(int* da, int iNumCol)
{
    int idx=threadIdx.x;
    int idy=threadIdx.y;

    da[(idx * iNumCol) + idy]+=2;
    // printf("It came here");
    //printf("%d \n",da[idx][idy]);
}

int main()
{
    int             ha[NUM_ROWS][NUM_COLS] ;
    int             *da ;
    int             hb[NUM_ROWS][NUM_COLS] ;
    int             iSize = NUM_ROWS * NUM_COLS * sizeof(int) ;
    cudaError_t     cuError = cudaSuccess ;
    dim3            dimGrid (1,1,1) ;
    dim3            dimBlock (NUM_ROWS, NUM_COLS, 1) ;

    for(int i=0;i<NUM_ROWS;i++)
    {
            for(int j=0;j<NUM_COLS;j++)
            {
                    ha[i][j]=0;
            }
    }

    cuError = cudaMalloc((void**)&da, iSize) ;
    if (cudaSuccess != cuError)
    {
            printf ("Failed to allocate memory\n") ;
            return 1 ;
    }
    cuError = cudaMemcpy(da, ha, iSize, cudaMemcpyHostToDevice);
    if (cudaSuccess != cuError)
    {
            cudaFree (da) ;
            printf ("Failed in Memcpy 1\n") ;
            return 1 ;
    }

    printf("Before kernel\n");
    for(int i=0;i<NUM_ROWS;i++)
    {
            for(int j=0;j<NUM_COLS;j++)
            {
                    printf("%d ",ha[i][j]);
            }
            printf("\n");
    }
    printf("\n");

    add2<<<dimGrid, dimBlock>>>(da, NUM_COLS);
    cuError = cudaGetLastError () ;
    if (cudaSuccess != cuError)
    {
            printf ("Failed in kernel launch and reason is %s\n", cudaGetErrorString(cuError)) ;
            return 1 ;
    }

    cuError = cudaMemcpy(hb, da, iSize, cudaMemcpyDeviceToHost);
    if (cudaSuccess != cuError)
    {
            cudaFree (da) ;
            printf ("Failed in Memcpy 2\n") ;
            return 1 ;
    }

    printf("After kernel\n");
    for(int i=0;i<NUM_ROWS;i++)
    {
            for(int j=0;j<NUM_COLS;j++)
            {
                    printf("%d ",hb[i][j]);
            }
            printf("\n");
    }
    cudaFree (da) ;

    return 0;
}

Thank you for your reply. I don't want to use the flattened/linear version of 2D array. Could there be a way to operate on 2D arrays in the kernel as we do usually...! Because, I want to implement the parallel computing on finite difference technique and the array dimension would be large. — user86927, Oct 23 '13 at 07:24
Sorry not very sure but the cudaMallocPitch () says you to use the following. `Given the row and column of an array element of type T, the address is computed as: T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;` So I doubt it. May be someone can give you definitive answer. — Sagar Masuti, Oct 23 '13 at 08:43
If you try to print the value of `pitch` its like 512 on my system. So if you consider that and the way the address is calculated it suggests that 8 bytes (2 * sizeof (int)) are situated at 0x00000000 then the next 8 bytes (2 * sizeof (int)) are situated at 0x00000200 (512 bytes away). Hence the addressing da[][] cant be done. Thats my understanding. — Sagar Masuti, Oct 23 '13 at 08:53

2D array in the Kernel CUDA

1 Answers1