I am seeing a performance drop when using the nppiCopyConstBorder_8u_C1R
function on two different architectures (GTX480 and GTX TITAN), which also involve different CUDA versions (v5.0 and v5.5 respectively).
In the first case (GTX480 and CUDA 5.0) the execution time of the function is
T = 0.00005 seconds
In the second case (GTX TITAN and CUDA 5.5) the execution time is
T = 0.969831 seconds
I have reproduced this behaviour with the following code:
// GTX480 nvcc -lnpp -m64 -O3 --ptxas-options=-v -gencode arch=compute_20,code=sm_20 --compiler-options -use_fast_math
// GTXTITAN nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_35,code=sm_35 --compiler-options -use_fast_math
#include <stdlib.h>
#include <stdio.h>
// CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
// CUDA Nvidia Performance Primitives
#include <npp.h>
#include <assert.h>
// simple error-checking macro (assumed here so the listing compiles; any equivalent definition works)
#define CUDA_CHECK_RETURN(value) {                                        \
    cudaError_t _m_cudaStat = value;                                      \
    if (_m_cudaStat != cudaSuccess) {                                     \
        fprintf(stderr, "Error %s at line %d in file %s\n",               \
                cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);     \
        exit(1);                                                          \
    } }
#define w 256 // width
#define h 256 // height
#define b 16 // extra border
#define BORDER_TYPE 0
int main(int argc, char *argv[])
{
// input data (host buffer)
Npp8u h_idata[w*h];
// output data (host buffer)
Npp8u h_odata[(w+b)*(h+b)];
/* MEMORY ALLOCATION AND INITIAL COPY OF DATA FROM CPU TO GPU */
Npp8u *i_devPtr, *i_devPtr_Border;
// size of the input data
int d_Size = w * h * sizeof(Npp8u);
// allocate input data
CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
// copy initial data to GPU
CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );
// size of the output data
int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);
// allocation for input data with extended border
CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );
// create struct with ROI size given the current mask
NppiSize SizeROI = {w, h};
NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };
// create events
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
// NPP Library Copy Constant Border
cudaEventRecord( start, 0 );
NppStatus eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
b, b, BORDER_TYPE);
cudaDeviceSynchronize();
assert( NPP_NO_ERROR == eStatusNPP );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("T= %1.5f sg\n", milliseconds / 1000.0f);
// copy output data from GPU
CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );
/* free resources */
cudaFree(i_devPtr);
cudaFree(i_devPtr_Border);
CUDA_CHECK_RETURN(cudaDeviceReset());
return 0;
}
Q: Is anyone aware of this issue?
This makes me ask the following question:
Q: How is nppiCopyConstBorder_8u_C1R
implemented? Does the function copy the data from device to host, extend the border on the host, and then copy the result back to the device?
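For reference, the operation itself does not need any host involvement. Below is a minimal sketch of what a purely device-side constant-border copy could look like; this is my own illustration (the kernel name, indexing, and launch configuration are assumptions), not NPP's actual implementation, and it assumes the same headers as the listing above:
// Hypothetical device-only constant-border copy, for illustration only (not NPP's code).
// Steps are in bytes, which equals elements for 8-bit single-channel data.
__global__ void myCopyConstBorder_8u(const Npp8u *src, int srcStep, NppiSize srcSize,
                                     Npp8u *dst, int dstStep, NppiSize dstSize,
                                     int topBorder, int leftBorder, Npp8u value)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dstSize.width || y >= dstSize.height) return;
    // map the destination pixel back into the source ROI
    int sx = x - leftBorder;
    int sy = y - topBorder;
    if (sx >= 0 && sx < srcSize.width && sy >= 0 && sy < srcSize.height)
        dst[y * dstStep + x] = src[sy * srcStep + sx];   // inside ROI: copy pixel
    else
        dst[y * dstStep + x] = value;                    // outside ROI: constant border
}
// e.g. launched with dim3 block(16,16) and a grid covering the destination size:
// myCopyConstBorder_8u<<<dim3((w+b+15)/16, (h+b+15)/16), dim3(16,16)>>>(...);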
PS: The machine with the TITAN has the GPU outside the box, on a separate board specially designed for multiple PCIe connections, and it is connected through a PCIe extension cable. I have not noticed any drawback with this configuration for the other kernels I have tested.
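To rule out hidden transfers over that link, running the test under nvprof should show whether any host/device memcpy is issued inside the nppiCopyConstBorder_8u_C1R call, for example (the binary name is just a placeholder):
nvcc -o border_test border_test.cu -lnppi -m64 -O3 -gencode arch=compute_35,code=sm_35
nvprof ./border_test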