I am trying to allocate device memory, copy to it, perform the calculations on the GPU, copy the results back and then free up the device memory I allocated. I wanted to make sure that I wasn't going over the limit and I wanted to see if I would have enough memory in the shared memory space to dump a few arrays.
When I allocate device memory, there are no errors being returned. When I use cudaMemGetInfo
to check the amount of memory allocated, it looks like one cudaMalloc
hasn't allocated any memory.
Also when I try to free the memory, it looks like only one pointer is freed.
I am using the matlab Mexfunction
interface to setup the GPU memory and launch the kernel. At this point, I'm not even calling into the kernel and just returning back a unit matrix for the results.
cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);
/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;
/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));
/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
/* call the kernel */
// launchKernel<<<1,512>>>(........);
/* retireve the output */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
/* free the memory. */
mexPrintf("\nFree'ing memory.\n");
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);
cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
Here is the output from this:
Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
I feel like there is something obvious that i'm missing. Can anyone help explain what is going on?
EDIT: platform is windows 7 with a Tesla C2050 GPu card.