I have implemented a Summed Area Table (integral image) in C++ using OpenMP.
The problem is that the sequential code is always faster than the parallel code, no matter how I change the number of threads or the image size.
For example, I tried image sizes from 100x100 up to 10000x10000 and thread counts from 1 to 64, but no combination is ever faster.
I also tried this code on different machines:
- Mac OSX, 1.4 GHz Intel Core i5 (dual core)
- Mac OSX, 2.3 GHz Intel Core i7 (quad core)
- Ubuntu 16.04, Intel Xeon E5-2620 2.4 GHz (12 cores)
The time is measured with the OpenMP function omp_get_wtime().
To compile I use: g++ -fopenmp -Wall main.cpp
Here is the parallel code:
void transpose(unsigned long *src, unsigned long *dst, const int N, const int M) {
    // src is N rows x M columns; dst becomes M rows x N columns
    #pragma omp parallel for
    for (int n = 0; n < N*M; n++) {
        int i = n / N;   // row index in dst
        int j = n % N;   // column index in dst
        dst[n] = src[M*j + i];
    }
}
unsigned long * integralImageMP(uint8_t *x, int n, int m){
    // x is n rows x m columns, row-major
    unsigned long * out  = new unsigned long[n*m];
    unsigned long * rows = new unsigned long[n*m];

    // 1) prefix sums along each of the n rows (each thread gets a block of rows)
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
    {
        rows[i*m] = x[i*m];
        for (int j = 1; j < m; ++j)
        {
            rows[i*m + j] = x[i*m + j] + rows[i*m + j - 1];
        }
    }

    // 2) transpose so the column sums become row sums (n x m -> m x n)
    transpose(rows, out, n, m);

    // 3) prefix sums along the m rows of the transposed matrix (row length is now n)
    #pragma omp parallel for
    for (int i = 0; i < m; ++i)
    {
        rows[i*n] = out[i*n];
        for (int j = 1; j < n; ++j)
        {
            rows[i*n + j] = out[i*n + j] + rows[i*n + j - 1];
        }
    }

    // 4) transpose back to the original orientation (m x n -> n x m)
    transpose(rows, out, m, n);

    delete [] rows;
    return out;
}
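As a quick sanity check of the transpose indexing (my own toy example, not part of the measured code), a 2x3 row-major matrix should come back as 3x2:

#include <cassert>

void transposeCheck() {
    unsigned long src[6] = {1, 2, 3,
                            4, 5, 6};   // 2 rows (N=2), 3 columns (M=3)
    unsigned long dst[6];
    transpose(src, dst, 2, 3);          // dst becomes 3 rows x 2 columns
    unsigned long expected[6] = {1, 4,
                                 2, 5,
                                 3, 6};
    for (int k = 0; k < 6; ++k)
        assert(dst[k] == expected[k]);
}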
Here is the sequential code:
unsigned long * integralImage(uint8_t *x, int n, int m){
    unsigned long * out = new unsigned long[n*m];
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < m; ++j)
        {
            // out[i][j] = x[i][j] + out[i-1][j] + out[i][j-1] - out[i-1][j-1]
            unsigned long val = x[i*m + j];
            if (i >= 1)
            {
                val += out[(i-1)*m + j];
                if (j >= 1)
                {
                    val += out[i*m + j - 1] - out[(i-1)*m + j - 1];
                }
            } else {
                if (j >= 1)
                {
                    val += out[i*m + j - 1];
                }
            }
            out[i*m + j] = val;
        }
    }
    return out;
}
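To make sure the two versions agree, they can be compared on a small random grayscale image with a harness along these lines (verification only, kept out of the timed code; the name sameResult is mine):

#include <cstdint>
#include <cstdlib>

bool sameResult(int n, int m) {
    uint8_t *img = new uint8_t[n * m];
    for (int k = 0; k < n * m; ++k)
        img[k] = std::rand() % 256;

    unsigned long *par = integralImageMP(img, n, m);   // parallel
    unsigned long *seq = integralImage(img, n, m);     // sequential

    bool ok = true;
    for (int k = 0; k < n * m; ++k)
        if (par[k] != seq[k]) { ok = false; break; }

    delete [] img;
    delete [] par;
    delete [] seq;
    return ok;
}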
I also tried a version without the transpose (see the sketch below), but it was even slower, probably because of the strided cache accesses.
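That variant did the column pass directly on the row sums, roughly like this (a sketch; the name integralImageMPNoTranspose is mine). In the second pass consecutive accesses are m elements apart, i.e. one full row, which is what I suspect hurts the cache:

#include <cstdint>

unsigned long * integralImageMPNoTranspose(uint8_t *x, int n, int m) {
    unsigned long *out = new unsigned long[n*m];

    // Row prefix sums, same as before.
    #pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        out[i*m] = x[i*m];
        for (int j = 1; j < m; ++j)
            out[i*m + j] = x[i*m + j] + out[i*m + j - 1];
    }

    // Column prefix sums done in place: each inner-loop access jumps a whole
    // row ahead, so the matrix is walked column by column.
    #pragma omp parallel for
    for (int j = 0; j < m; ++j)
        for (int i = 1; i < n; ++i)
            out[i*m + j] += out[(i-1)*m + j];

    return out;
}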
An example of calling code:
#include <cstdint>
#include <iostream>
#include <omp.h>

int main(int argc, char **argv){
    uint8_t* image = //read image from file (gray scale)
    int height = //height of the image
    int width = //width of the image

    double start_omp = omp_get_wtime();
    unsigned long* integral_image_parallel = integralImageMP(image, height, width); // parallel
    double end_omp = omp_get_wtime();
    double time_tot = end_omp - start_omp;
    std::cout << time_tot << std::endl;

    start_omp = omp_get_wtime();
    unsigned long* integral_image_serial = integralImage(image, height, width); // sequential
    end_omp = omp_get_wtime();
    time_tot = end_omp - start_omp;
    std::cout << time_tot << std::endl;

    return 0;
}
To illustrate what each thread is doing: each thread works on a contiguous block of rows during RowSum, and ColumnSum is done by transposing the matrix and repeating RowSum.