In a CPU version of some code, I have many calls that look like the following:
for(int i = 0; i < N; i++){
    dgemm(A[i], B[i], C[i], Size[i][0], Size[i][1], Size[i][2], Size[i][3], 'N', 'T');
}
where each A[i] is a 2D matrix of some size.
I would like to be able to do this on a GPU using CULA (I'm not just doing multiplies, so I need the linear algebra operations in CULA), so for example:
for(int i = 0; i < N; i++){
    status = culaDeviceDgemm('T', 'N', Size[i][0], Size[i][0], Size[i][0], alpha, GlobalMat_d[i], Size[i][0], NG_d[i], Size[i][0], beta, GG_d[i], Size[i][0]);
}
However, I would like to store my B's on the GPU in advance at the start of the program, as they don't change, but I have no idea how to go about doing that, or how to store my arrays in general so that this is possible.
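To make the goal concrete, something along these lines is what I am aiming for (only a rough sketch; B_d, rowsB and colsB are placeholder names, and error checking is left out):

// Rough sketch of a one-time setup: copy every B[i] to the GPU and keep the
// device pointers in a host-side array for later use in CULA calls.
// rowsB[i]/colsB[i] are placeholders for whatever the dimensions of B[i] are.
double **B_d = (double **)malloc(N * sizeof(double*));
for(int i = 0; i < N; i++){
    size_t bytes = rowsB[i]*colsB[i]*sizeof(double);
    cudaMalloc( (void **)&B_d[i], bytes );
    cudaMemcpy( B_d[i], B[i], bytes, cudaMemcpyHostToDevice );
}

The idea being that each B_d[i] could then be passed straight to the CULA calls later on, but I don't know whether this is a sensible way to organise the storage.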
I've seen various things online about using 3D matrices with CUDA, but they don't seem very applicable to then calling the CULA functions on the individual matrices.
From the example in the answer below I have this:
extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){
cudaError_t err;
err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
checkCudaError(err);
for(int i =0; i < numpulsars;i++){
err = cudaMalloc( (void **) &(GlobalFVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
checkCudaError(err);
// err = cudaMemcpy( GlobalFVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
// checkCudaError(err);
}
}
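checkCudaError is just a small helper of mine that aborts on any CUDA error; roughly along these lines (a sketch, not the exact code, included only so the snippet is self-contained):

// Rough stand-in for the checkCudaError helper: print the CUDA error string and bail out.
// (Needs <stdio.h>, <stdlib.h> and the CUDA runtime header.)
void checkCudaError(cudaError_t err){
    if(err != cudaSuccess){
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        exit(1);
    }
}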
In that function I have declared double **GlobalFVecs_d as a global. However, I get a seg fault when it hits the line

    err = cudaMalloc( (void **) &(GlobalFVecs_d[i]), numcoeff*numcoeff*sizeof(double) );

yet it seems to be exactly what is done in the other example?
I realised it wasn't quite the same: the array of pointers GlobalFVecs_d lives in device memory, so cudaMalloc can't write the new pointer into GlobalFVecs_d[i] from host code. I now have code that compiles, with:
double **GlobalFVecs_d;
double **GlobalFPVecs_d;

extern "C" void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff){

    cudaError_t err;
    GlobalFPVecs_d = (double **)malloc(numpulsars * sizeof(double*));
    err = cudaMalloc( (void ***)&GlobalFVecs_d, numpulsars*sizeof(double*) );
    checkCudaError(err);

    for(int i = 0; i < numpulsars; i++){
        err = cudaMalloc( (void **) &(GlobalFPVecs_d[i]), numcoeff*numcoeff*sizeof(double) );
        checkCudaError(err);
        err = cudaMemcpy( GlobalFPVecs_d[i], FNFVecs[i], sizeof(double)*numcoeff*numcoeff, cudaMemcpyHostToDevice );
        checkCudaError(err);
    }

    err = cudaMemcpy( GlobalFVecs_d, GlobalFPVecs_d, sizeof(double*)*numpulsars, cudaMemcpyHostToDevice );
    checkCudaError(err);
}
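For reference, this is roughly how the routine gets called once at the start of the program (just a sketch; the way FNFVecs is built here is a stand-in for my actual setup):

// Hypothetical host-side setup: numpulsars pointers, each to a
// numcoeff*numcoeff block, then one call to do all the copying.
double **FNFVecs = (double **)malloc(numpulsars * sizeof(double*));
for(int i = 0; i < numpulsars; i++){
    FNFVecs[i] = (double *)malloc(numcoeff*numcoeff*sizeof(double));
    /* ... fill FNFVecs[i] ... */
}
copyFNFVecs_(FNFVecs, numpulsars, numcoeff);
// After this, GlobalFPVecs_d[i] (on the host) and GlobalFVecs_d (on the device)
// both hold the device addresses of the individual matrices.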
However, if I now try to access it with:
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid; //((G + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
dimGrid.x = (numcoeff + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (numcoeff + dimBlock.y - 1)/dimBlock.y;

for(int i = 0; i < numpulsars; i++){
    CopyPPFNF<<<dimGrid, dimBlock>>>(PPFMVec_d, GlobalFVecs_d[i], numpulsars, numcoeff, i);
}
it seg faults here instead. Is this not how to get at the data?