I'm working on a project to parallelize and speed up some statistical/numerical computation scripts written by other people. Before this project started, I was a complete novice at programming (I'm more the analytical math type), so please forgive any ensuing ignorance or complete misunderstanding. They're using the following function to generate matrices:
double ** CreateMatrix(int m, int n)
{
    int i;
    double **A;
    // allocate the array of m row pointers
    A = (double **) malloc((size_t)(m*sizeof(double *)));
    // allocate all rows as one contiguous block and set the row pointers
    A[0] = (double *) malloc((size_t)((m*n)*sizeof(double)));
    for(i=1; i<m; i++){
        A[i] = A[i-1] + n;
    }
    // return the pointer to the array of row pointers
    return A;
}
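One detail that seems relevant here: because all of the rows come out of a single contiguous allocation, I believe the whole matrix can also be viewed as flat 1D storage through A[0]. This is just my reading of the allocation above, not something the original authors state:

double **A = CreateMatrix(m, n);
double *flat = A[0]; // the contiguous block of m*n doubles
// if I've read the layout right, A[i][j] and flat[i*n + j]
// name the same element for 0 <= i < m, 0 <= j < n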
I'm not too keen on reworking the basic structure of their matrix objects, since they've designed their entire code around it. Instead, I've been trying to pass these structures to the GPU as 1D linear memory, because I've read that allocating device memory for, and copying, an array of pointers is too inefficient on the GPU. I've tried to get this most basic example working:
__global__ void MatrixMult(double *A, double *B, double *C, int N)
{
    // note: despite the name, this currently does element-wise addition
    int col = blockDim.x*blockIdx.x + threadIdx.x;
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    if( col < N && row < N ){
        C[col*N + row] = A[col*N + row] + B[col*N + row];
        //C[col][row] = B[col][row] + A[col][row];
    }
}
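One thing I'm unsure about: the documentation for cudaMallocPitch says rows may be padded, and that elements should be addressed via the pitch in bytes rather than by assuming a stride of N. If that's right, I'd expect the kernel to have to look something like the sketch below (the name MatrixAddPitched and the exact casts are my guess at the documented pattern):

__global__ void MatrixAddPitched(const double *A, const double *B, double *C, size_t pitch, int N)
{
    int col = blockDim.x*blockIdx.x + threadIdx.x;
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    if( col < N && row < N ){
        // pitch is in bytes, so step through char* to reach the row
        const double *rowA = (const double *)((const char *)A + row*pitch);
        const double *rowB = (const double *)((const char *)B + row*pitch);
        double *rowC = (double *)((char *)C + row*pitch);
        rowC[col] = rowA[col] + rowB[col];
    }
}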
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

using std::cout;

void PrintMatrix(double **A, int m, int n); // defined below

const int N = 5000;

int main()
{
    double **h_A, **h_B, **h_C;
    h_A = CreateMatrix(N,N);
    h_B = CreateMatrix(N,N);
    h_C = CreateMatrix(N,N);
    for(int i=0; i<N; i++){
        for(int j=0; j<N; j++){
            h_A[i][j] = 1;
            h_B[i][j] = 6;
            h_C[i][j] = 0;
        }
    }
    size_t pitchA, pitchB, pitchC;
    double *d_A, *d_B, *d_C;
    cudaMallocPitch(&d_A, &pitchA, N*sizeof(double), N);
    cudaMallocPitch(&d_B, &pitchB, N*sizeof(double), N);
    cudaMallocPitch(&d_C, &pitchC, N*sizeof(double), N);
    cudaMemcpy2D(d_A, pitchA, h_A, N*sizeof(double), N*sizeof(double), N, cudaMemcpyHostToDevice);
    cudaMemcpy2D(d_B, pitchB, h_B, N*sizeof(double), N*sizeof(double), N, cudaMemcpyHostToDevice);
    cudaMemcpy2D(d_C, pitchC, h_C, N*sizeof(double), N*sizeof(double), N, cudaMemcpyHostToDevice);
    // 250 blocks of 20 threads per dimension: 250*20 = 5000 = N
    dim3 GridSize(250,250,1);
    dim3 BlockSize(20,20,1);
    MatrixMult<<<GridSize, BlockSize>>>(d_A, d_B, d_C, N);
    cudaMemcpy2D(h_C, N*sizeof(double), d_C, pitchC, N*sizeof(double), N, cudaMemcpyDeviceToHost);
    PrintMatrix(h_C, N, N);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
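I should also admit that I'm not checking any of the CUDA return codes above, so an earlier call may well be failing silently. My understanding is that a wrapper along these lines would surface errors (just a sketch; the CHECK name is mine):

#include <cstdio>

// abort with the CUDA error string if a runtime API call fails
#define CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error %s at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// usage: CHECK(cudaMallocPitch(&d_A, &pitchA, N*sizeof(double), N));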
The problem is that I get a segfault when I try to use the PrintMatrix function to check the results:
void PrintMatrix(double **A, int m, int n)
{
    int i, j;
    for(i=0; i<m; i++){
        for(j=0; j<n; j++){
            cout << A[i][j] << "\t";
        }
        cout << "\n";
    }
}
I suspect there's some subtle realignment of memory I'm not understanding. My first question is whether it's possible to pass a 2D double** object to the device as 1D double* memory, do some computation, and then copy the result back into its original double** format on the host. If so, can someone tell me what I'm missing?
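For reference, this is the simpler round trip I had in mind with plain cudaMalloc and no pitch, relying on the contiguous CreateMatrix layout above (a sketch of my intent, not code I've verified):

double *d_A, *d_C;
cudaMalloc(&d_A, N*N*sizeof(double));
cudaMalloc(&d_C, N*N*sizeof(double));
// the host data behind h_A actually starts at h_A[0]
cudaMemcpy(d_A, h_A[0], N*N*sizeof(double), cudaMemcpyHostToDevice);
// ... launch a kernel that treats d_A/d_C as flat length-N*N arrays ...
cudaMemcpy(h_C[0], d_C, N*N*sizeof(double), cudaMemcpyDeviceToHost);
// afterwards h_C[i][j] should be the flat element i*N + j, with the
// double** view on the host left intact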