I'm working on a project to parallelize and speed up some statistical/numerical computation scripts written by other people. Before this project started, I was a complete novice at programming (I'm more the analytical math type), so please forgive any ensuing ignorance or complete misunderstanding. They're using the following function to generate matrices:
double ** CreateMatrix(int m, int n)
{
    int i;
    double **A;
    // allocate the array of m row pointers
    A = (double **) malloc((size_t)(m*sizeof(double *)));
    // allocate all rows as one contiguous block and set the row pointers
    A[0] = (double *) malloc((size_t)((m*n)*sizeof(double)));
    for(i=1; i<m; i++){
        A[i] = A[i-1] + n;
    }
    // return the pointer to the array of row pointers
    return A;
}
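One detail that seems relevant here: because all of the rows come out of a single contiguous allocation, I believe the whole matrix can also be viewed as flat 1D storage through A[0]. This is just my reading of the allocation above, not something the original authors state:

double **A = CreateMatrix(m, n);
double *flat = A[0]; // the contiguous block of m*n doubles
// if I've read the layout right, A[i][j] and flat[i*n + j]
// name the same element for 0 <= i < m, 0 <= j < n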
I'm not too keen on reworking the basic structure of their matrix objects, since they've designed their entire code around it. Instead, I've been trying to pass these structures to the GPU as 1D linear memory, because I've read that allocating device memory for, and copying, an array of pointers is too inefficient on the GPU. I've tried to get this most basic example working:
__global__ void MatrixMult(double *A, double *B, double *C, int N)
{
    // note: despite the name, this currently does element-wise addition
    int col = blockDim.x*blockIdx.x + threadIdx.x;
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    if( col < N && row < N ){
        C[col*N + row] = A[col*N + row] + B[col*N + row];
        //C[col][row] = B[col][row] + A[col][row];
    }
}
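One thing I'm unsure about: the documentation for cudaMallocPitch says rows may be padded, and that elements should be addressed via the pitch in bytes rather than by assuming a stride of N. If that's right, I'd expect the kernel to have to look something like the sketch below (the name MatrixAddPitched and the exact casts are my guess at the documented pattern):

__global__ void MatrixAddPitched(const double *A, const double *B, double *C, size_t pitch, int N)
{
    int col = blockDim.x*blockIdx.x + threadIdx.x;
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    if( col < N && row < N ){
        // pitch is in bytes, so step through char* to reach the row
        const double *rowA = (const double *)((const char *)A + row*pitch);
        const double *rowB = (const double *)((const char *)B + row*pitch);
        double *rowC = (double *)((char *)C + row*pitch);
        rowC[col] = rowA[col] + rowB[col];
    }
}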
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

using std::cout;

void PrintMatrix(double **A, int m, int n); // defined below

const int N = 5000;

int main()
{
    double **h_A, **h_B, **h_C;
    h_A = CreateMatrix(N,N);
    h_B = CreateMatrix(N,N);
    h_C = CreateMatrix(N,N);
    for(int i=0; i<N; i++){
        for(int j=0; j<N; j++){
            h_A[i][j] = 1;
            h_B[i][j] = 6;
            h_C[i][j] = 0;
        }
    }
    size_t pitchA, pitchB, pitchC;
    double *d_A, *d_B, *d_C;
    cudaMallocPitch(&d_A, &pitchA, N*sizeof(double), N);
    cudaMallocPitch(&d_B, &pitchB, N*sizeof(double), N);
    cudaMallocPitch(&d_C, &pitchC, N*sizeof(double), N);
    cudaMemcpy2D(d_A, pitchA, h_A, N*sizeof(double), N*sizeof(double), N, cudaMemcpyHostToDevice);
    cudaMemcpy2D(d_B, pitchB, h_B, N*sizeof(double), N*sizeof(double), N, cudaMemcpyHostToDevice);
    cudaMemcpy2D(d_C, pitchC, h_C, N*sizeof(double), N*sizeof(double), N, cudaMemcpyHostToDevice);
    // 250 blocks of 20 threads per dimension: 250*20 = 5000 = N
    dim3 GridSize(250,250,1);
    dim3 BlockSize(20,20,1);
    MatrixMult<<<GridSize, BlockSize>>>(d_A, d_B, d_C, N);
    cudaMemcpy2D(h_C, N*sizeof(double), d_C, pitchC, N*sizeof(double), N, cudaMemcpyDeviceToHost);
    PrintMatrix(h_C, N, N);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
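I should also admit that I'm not checking any of the CUDA return codes above, so an earlier call may well be failing silently. My understanding is that a wrapper along these lines would surface errors (just a sketch; the CHECK name is mine):

#include <cstdio>

// abort with the CUDA error string if a runtime API call fails
#define CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error %s at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// usage: CHECK(cudaMallocPitch(&d_A, &pitchA, N*sizeof(double), N));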
The problem is that I get a segfault when I try to use the PrintMatrix function to check the results:
void PrintMatrix(double **A, int m, int n)
{
    int i, j;
    for(i=0; i<m; i++){
        for(j=0; j<n; j++){
            cout << A[i][j] << "\t";
        }
        cout << "\n";
    }
}
I suspect there's some subtle realignment of memory I'm not understanding. My first question is whether it's possible to pass a 2D double** object to the device as 1D double* memory, do some computation, and then copy the result back into its original double** format on the host. If so, can someone tell me what I'm missing?
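For reference, this is the simpler round trip I had in mind with plain cudaMalloc and no pitch, relying on the contiguous CreateMatrix layout above (a sketch of my intent, not code I've verified):

double *d_A, *d_C;
cudaMalloc(&d_A, N*N*sizeof(double));
cudaMalloc(&d_C, N*N*sizeof(double));
// the host data behind h_A actually starts at h_A[0]
cudaMemcpy(d_A, h_A[0], N*N*sizeof(double), cudaMemcpyHostToDevice);
// ... launch a kernel that treats d_A/d_C as flat length-N*N arrays ...
cudaMemcpy(h_C[0], d_C, N*N*sizeof(double), cudaMemcpyDeviceToHost);
// afterwards h_C[i][j] should be the flat element i*N + j, with the
// double** view on the host left intact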