I am learning CUDA. I am having difficulty indexing dynamically sized arrays that I have transferred to the device.
As a toy problem I am trying to sum two matrices on interior points only. I have found that if I use static arrays the result is as expected but in practice my data will be too big to fit on the stack so it must be dynamically sized arrays.
I think the reason static arrays work as expected is because the data is tight or packed, no gaps between adjacent elements in memory. But the dynamic array is disjointed and points to various places in memory for each 'row' of the matrix. This is reflected in the result below where the elements are not where I'd expect them to be.
I have found several examples online that show how to index 2D arrays on the device but they all use static arrays as examples.
I'd appreciate any advice on how to approach this.
My idea right now is to flatten the dynamically sized matrices on the host first (so convert them to double *
of size Nx * Ny * sizeof(double)
) and then copy that to the device? But if I do that I'm not sure I can use the same indexing scheme I have shown below (and which I'd prefer to keep if possible)?
Dynamic Array Result
2.000000 2.000000 2.000000 0.000000 0.000000 2.000000
2.000000 0.000000 0.000000 2.000000 2.000000 2.000000
0.000000 2.000000 2.000000 2.000000 2.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Static Array Result
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.000000 2.000000 2.000000 2.000000 2.000000 0.000000
0.000000 2.000000 2.000000 2.000000 2.000000 0.000000
0.000000 2.000000 2.000000 2.000000 2.000000 0.000000
0.000000 2.000000 2.000000 2.000000 2.000000 0.000000
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
MWE
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 2
__global__ void matrix_add(double *C, double *A, double *B, int Nx, int Ny);
double **create_matrix(int row, int col);
int main(void)
{
int Nx = 6;
int Ny = 6;
// Create dynamically sized arrays on host
double **A = create_matrix(Nx, Ny);
double **B = create_matrix(Nx, Ny);
double **C = create_matrix(Nx, Ny);
// Fill arrays
for (int i = 0; i < Nx; i++)
{
for (int j = 0; j < Ny; j++)
{
A[i][j] = 1.0;
B[i][j] = 1.0;
C[i][j] = 0.0;
}
}
// Next create arrays for device
double *d_A, *d_B, *d_C;
// Allocate memory on GPU
int size = Nx * Ny * sizeof(double);
cudaMalloc((void **)&d_A, size);
cudaMalloc((void **)&d_B, size);
cudaMalloc((void **)&d_C, size);
// Copy host arrays over to device arrays.
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, size, cudaMemcpyHostToDevice);
dim3 dimBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
int blockdimX = (int)ceil(Nx / dimBlock.x);
int blockdimY = (int)ceil(Ny / dimBlock.y);
dim3 dimGrid(blockdimX, blockdimY);
matrix_add<<<dimGrid, dimBlock>>>(d_C, d_A, d_B, Nx, Ny);
// Copy back to CPU
cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);
// inspect result
for (int i = 0; i < Nx; i++)
{
for (int j = 0; j < Ny; j++)
{
printf("%f ", C[i][j]);
}
printf("\n");
}
return EXIT_SUCCESS;
}
__global__ void matrix_add(double *C, double *A, double *B, int Nx, int Ny)
{
int col = threadIdx.x + blockIdx.x * blockDim.x;
int row = threadIdx.y + blockIdx.y * blockDim.y;
int index = col + row * Nx;
if (row != 0 && col != 0 && row < (Ny - 1) && col < (Nx - 1))
{
C[index] = A[index] + B[index];
}
}
double **create_matrix(int row, int col)
{
double **mat;
mat = (double **)calloc(row, sizeof(double *));
for (int i = 0; i < row; i++)
{
mat[i] = (double *)calloc(col, sizeof(double));
}
return mat;
}