0

I'm trying to write a simple array sum between two dynamic arrays using double pointers, both for host and device arrays. The following code can be compiled normally, but at the end the array h_c, containing the result of the sum, is full of zeros. I think that the device arrays are bad allocated and the host arrays aren't transferred properly. Can someone fix the problem? Thanks.

#include <stdio.h>
#include <cuda_runtime.h>
#define N 16
#define BLOCK_DIM 4

__global__ void matrixAdd (int **a, int **b, int **c) {

int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

if (col < N && row < N) 
c[row][col] = a[row][col] + b[row][col];}

int** create(int row, int col){
  int i,j;
  int** temp;

  temp=(int**) malloc(row*sizeof(int*));
  for(i=0;i<row;i++)
    temp[i]=(int*) malloc(row*sizeof(int));

  for(i=0;i<row;i++)
    for(j=0;j<col;j++)
    temp[i][j]=0;

 return(temp);}

void destroy(int **temp,int rows){
  int i;

  for(i=0;i<rows;i++)
    free(temp[i]);
  free(temp);
 }

int main() {
int i,j;
int** h_a=create(N,N);
int** h_b=create(N,N);
int** h_c=create(N,N);
int **dev_a, **dev_b, **dev_c;
int size = N * N * sizeof(int);

cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);

for(i=0;i<N;i++)  
   for(j=0;j<N;j++)
    h_a[i][j]=5;

for(i=0;i<N;i++)  
   for(j=0;j<N;j++)
    h_b[i][j]=15;  


cudaMemcpy(dev_a, h_a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, h_b, size, cudaMemcpyHostToDevice);

dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));

matrixAdd<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c);
cudaMemcpy(h_c,dev_c, size, cudaMemcpyDeviceToHost);

for(i=0;i<N;i++)
{  
   for(j=0;j<N;j++)
    printf("%d ",h_c[i][j]);
   printf("\n");
}

cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);

destroy(h_a,N); destroy(h_b,N); destroy(h_c,N);

return 0;}
talonmies
  • 70,661
  • 34
  • 192
  • 269
horus
  • 91
  • 1
  • 9
  • 1
    hover your mouse over the cuda tag. When the pop up appears, click on "info". In the page that opens up, click on "Using Arrays of pointers in CUDA". Start reading. Using dynamically allocated 2D arrays is non-trivial, and you have two errors. 1. You must properly allocate the device array so that the pointers can be chased. 2. Your host array cannot be conveniently allocated with a bunch of `malloc` operations like you are doing. You need a guaranteed contiguous array on the host, if you dont want to have to use a loop to transfer data from host to device and back. – Robert Crovella Mar 09 '17 at 14:52
  • Thank you talonmies. Yes, my question is similar to the one you cited. The following code works! But I'm not able to understand why it is necessary to define h_a and a casting to (void***) for d&a. Can someone explain me it? Thanks. – horus Mar 15 '17 at 14:06
  • `int** aa=create(2,2);` `aa[0][0]=1; aa[0][1]=2;aa[1][0]=3;aa[1][1]=4;` `int** h_a = (int**)malloc(2 * sizeof(int*));` `for(int i=0; i<2;i++){` `cudaMalloc((void**)&h_a[i], 2*sizeof(int));` `cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice);` `}` `int** d_a;` `cudaMalloc((void ***)&d_a, 2 * sizeof(int*));` `cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);` – horus Mar 15 '17 at 14:12

0 Answers0