I wanted to used 2D array on GPU as we do on CPU. Thus the below code. It executes without errors but returns some garbage values.
Could anyone please suggest me what might have went wrong...!
Thank you.
#include<stdio.h>
__global__ void add2(int** da)
{
int idx=threadIdx.x;
int idy=threadIdx.y;
da[idx][idy]+=2;
// printf("It came here");
printf("%d \n",da[idx][idy]);
}
int main()
{
int ha[2][2],**da, hb[2][2];
size_t pitch;
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
ha[i][j]=0;
}
cudaMallocPitch((void**)&da, &pitch, 2*sizeof(int),2);
cudaMemcpy2D(&da, 2*sizeof(int), ha, pitch, 2*sizeof(int), 2, cudaMemcpyHostToDevice);
printf("Before kernel\n");
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
printf("%d ",ha[i][j]);
printf("\n");
}
printf("\n");
add2<<<2,2>>>(da);
// gpuErrchk(cudaPeekAtLastError());
// gpuErrchk(cudaDeviceSynchronize());
cudaMemcpy2D(&hb, 2*sizeof(int), da, pitch, 2*sizeof(int), 2, cudaMemcpyDeviceToHost);
printf("After kernel\n");
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
printf("%d ",hb[i][j]);
printf("\n");
}
return 0;
}