Here are the results of some tests I ran, resizing a random image 10,000 times with OpenCV functions. The best solution seems to be to convert to grayscale (if possible) before resizing, to use an ROI, or to roll your own ASM/AVX function that resizes by sampling every third row and column (or whatever scaling factor you need). The built-in resize functions are already fairly well optimised.
Colour
INTER_LINEAR 7953.89ms
INTER_LINEAR GPU 2252.72ms
INTER_LINEAR GPU MEMIO 23303.7ms
INTER_NEAREST 7297.58ms
INTER_NEAREST GPU 906.336ms
INTER_NEAREST GPU MEMIO 22374.1ms
BORDER_DEFAULT 47488.8ms
BORDER_REFLECT 47515.4ms
BORDER_REPLICATE 47516ms
BORDER_WRAP 47980.7ms
PYR GPU 4126.93ms
Grayscale
INTER_LINEAR 413.789ms
INTER_LINEAR GPU 1027.85ms
INTER_LINEAR GPU MEMIO 9568.99ms
INTER_NEAREST 978.89ms
INTER_NEAREST GPU 747.621ms
INTER_NEAREST GPU MEMIO 9346.28ms
BORDER_DEFAULT 19266.7ms
BORDER_REFLECT 19274.1ms
BORDER_REPLICATE 19300.8ms
BORDER_WRAP 19386.3ms
PYR GPU 2272.7ms
#include "opencv2/opencv.hpp"
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/cudawarping.hpp"
#include <iostream>
#include <string>
#include <chrono>
using namespace std;
using namespace cv;
/// Times `n` consecutive CPU downscales of `frame` to half its width and height.
///
/// @param frame      Source image; it is only read, never modified.
/// @param n          Number of cv::resize calls to run inside the timed region.
/// @param resizeFlag Interpolation flag forwarded unchanged to cv::resize
///                   (the callers in this file pass INTER_LINEAR / INTER_NEAREST).
/// @return Total wall-clock time for all `n` calls, in milliseconds.
template <typename T>
double resizePerfEval(const Mat& frame, unsigned int n, T resizeFlag) {
    const auto start = chrono::steady_clock::now();
    // The counter shares the unsigned type of `n`; a deduced `int` counter would
    // mix signed and unsigned in the comparison and overflow for n > INT_MAX.
    for (unsigned int i = 0; i < n; ++i) {
        // Declared inside the loop so every call starts from an empty output Mat.
        Mat temp;
        resize(frame, temp, Size(), 0.5, 0.5, resizeFlag);
    }
    return chrono::duration<double, milli>(chrono::steady_clock::now() - start).count();
}
template <typename T>
double pyramidPerfEval(const Mat& frame, unsigned int n, T border) {
auto start = chrono::steady_clock::now();
Size s(frame.cols / 2, frame.rows / 2);
for (auto i = 0; i < n; i++) {
Mat tmp;
pyrDown(frame, tmp, s, border);
}
return chrono::duration <double, milli>(chrono::steady_clock::now() - start).count();
}
template <typename T>
double resizePerfEvalGPU(const Mat& frame, unsigned int n, T resizeFlag, bool uploadDownload=false) {
auto start = chrono::steady_clock::now();
Mat tmp;
cuda::GpuMat frame_d, temp;
frame_d.upload(frame);
for (auto i = 0; i < n; i++) {
cuda::resize(frame_d, temp, Size(), 0.5, 0.5, resizeFlag);
if (uploadDownload) {
temp.download(tmp);
frame_d.upload(frame);
}
}
return chrono::duration <double, milli>(chrono::steady_clock::now() - start).count();
}
double pyramidPerfEvalGPU(const Mat& frame, unsigned int n, bool uploadDownload = false) {
auto start = chrono::steady_clock::now();
Mat tmp;
cuda::GpuMat frame_d, temp;
frame_d.upload(frame);
for (auto i = 0; i < n; i++) {
cuda::pyrDown(frame_d, temp);
if (uploadDownload) {
temp.download(tmp);
frame_d.upload(frame);
}
}
return chrono::duration <double, milli>(chrono::steady_clock::now() - start).count();
}
/// Runs every benchmark variant on `frame` and prints one "<label> <time>ms"
/// line per variant to standard output.
///
/// @param frame Image handed to each benchmark function.
/// @param n     Iteration count handed to each benchmark function.
void runTest(const Mat& frame, unsigned int n) {
    // Writes a single result line and flushes it, so each result is visible
    // as soon as its benchmark has finished.
    const auto report = [](const char* label, double elapsedMs) {
        cout << label << elapsedMs << "ms" << endl;
    };

    // Resize benchmarks: CPU, GPU, and GPU including host<->device transfers.
    report("INTER_LINEAR ", resizePerfEval(frame, n, INTER_LINEAR));
    report("INTER_LINEAR GPU ", resizePerfEvalGPU(frame, n, INTER_LINEAR));
    report("INTER_LINEAR GPU MEMIO ", resizePerfEvalGPU(frame, n, INTER_LINEAR, true));
    report("INTER_NEAREST ", resizePerfEval(frame, n, INTER_NEAREST));
    report("INTER_NEAREST GPU ", resizePerfEvalGPU(frame, n, INTER_NEAREST));
    report("INTER_NEAREST GPU MEMIO ", resizePerfEvalGPU(frame, n, INTER_NEAREST, true));

    // Pyramid downsampling on the CPU, one run per border mode.
    report("BORDER_DEFAULT ", pyramidPerfEval(frame, n, BORDER_DEFAULT));
    report("BORDER_REFLECT ", pyramidPerfEval(frame, n, BORDER_REFLECT));
    report("BORDER_REPLICATE ", pyramidPerfEval(frame, n, BORDER_REPLICATE));
    report("BORDER_WRAP ", pyramidPerfEval(frame, n, BORDER_WRAP));

    // Pyramid downsampling on the GPU.
    report("PYR GPU ", pyramidPerfEvalGPU(frame, n));
}
int main(int argc, char* argv[])
{
Mat gsframe, frame = Mat::ones(Size(1920, 1080), CV_8UC3);
randu(frame, Scalar::all(0), Scalar::all(255));
cvtColor(frame, gsframe, CV_BGR2GRAY);
auto n = 10000;
cout << "Colour" << endl;
runTest(frame, n);
cout << endl << "Grayscale" << endl;
runTest(gsframe, n);
return 0;
}
If the algorithm is running on a PC, an alternative is to do the resizing on a CUDA-enabled GPU. You would have to be careful in selecting the card, however, as its memory bandwidth needs to be high enough to compensate for the time taken to upload images to and download them from GPU memory.
Note from the results that the CPU beats the GPU on grayscale images (with INTER_LINEAR) and whenever the image is not already in GPU memory. If the image is already in GPU memory, then for colour images it's about a 3.5X speed-up using the GPU with INTER_LINEAR (especially for very large image sizes). For high-end applications, NVIDIA capture cards with GPUDirect can be used to achieve this.
Benchmarks were carried out on a Xeon E5 v2 @ 3.0 GHz with a GeForce GTX 680.