When we have a linear array d_A
in CUDA, one can reset all of its values to 0 by simply
cudaMemset(d_A, 0, K*K*sizeof(unsigned int) )
and this works very fast (I suppose at the rate of the global memory bandwidth). If we now have a CUDA array cu_A
that is needed for texture/surface memory, I could not find an equivalent function to reset its values, so I have tried two workarounds: 1) store a linear array d_A
full of zeros and copy it to the CUDA array:
cudaMemcpyToArray(cu_A, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);
I found that the speed of this copy is about 10% of my global memory bandwidth, so a bit underwhelming. Then I tried option 2), where I store another CUDA array cu_B
which has pre-copied zeros in it, and then copy that to the main CUDA array. Here is the minimal working example:
#include "mex.h"
#include "gpu/mxGPUArray.h"
#define K 4096 // data dimension
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
mxInitGPU();
// Declare the density field
mwSize const Asize[] = { K, K };
mxGPUArray *A = mxGPUCreateGPUArray(2, Asize, mxUINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); // initialized to zeros
unsigned int *d_A = (unsigned int *)(mxGPUGetData(A));
// Allocate CUDA arrays in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cu_A;
cudaArray* cu_B;
cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);
cudaMallocArray(&cu_B, &channelDesc, K, K, cudaArraySurfaceLoadStore);
/* Store the blank CUDA array here */
cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int timeStep = 0; timeStep<1000; timeStep++) {
cudaMemcpyArrayToArray ( cu_A, 0, 0, cu_B, 0, 0, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice ); // Reset the working memory
}
mxGPUDestroyGPUArray(A);
cudaFreeArray(cu_A);
cudaFreeArray(cu_B);
}
To my dismay, this Array-to-Array copy is running at a rate of merely
(4096*4096 elements)*(1000 iterations)*(4 bits)/(measured 9.6 s) = 7 Gb/s
out of 288 Gb/s that my Quadro P5000 should be capable of.
Do these figures make sense and is there a faster way to reset a CUDA array?