I have this algorithm which scans an image and, for each pixel p, calculates a 256-bin histogram in which the values of the pixels inside a patch around p are saved. The algorithm needs to be O(1), so I need to do many histogram additions. I'd like to make the algorithm faster by parallelizing the histogram additions with OpenMP, so I added #pragma omp parallel for
before each for loop (just the ones with histogram additions), but it actually makes it 10 times slower. I think I need to create a parallel region outside the loops, but I don't understand how.
Also, I'm afraid the overhead generated by OpenMP outweighs the speed gained by parallelizing a 256-iteration for loop, but I don't know for sure.
// Scans every pixel (i, j) of `src` and keeps `histogram` up to date by
// subtracting and adding whole per-column histograms from `colHisto`, instead
// of recounting pixels for each position.
// `colHisto[c][v]` is indexed by column c and 8-bit pixel value v.
// NOTE(review): `side` is presumably the patch side length and `src` a
// single-channel 8-bit cv::Mat; neither declaration is visible here, confirm.
// NOTE(review): `histogram` carries over from one j to the next and `colHisto`
// from one i to the next, so the outer loops have loop-carried dependencies.
// Only the two 256-bin loops below are independent per bin h. They are very
// short, so opening an OpenMP parallel region per pixel probably costs more
// than the loop itself; vectorization (e.g. `#pragma omp simd`) may fit
// better. Measure before choosing.
for (int i = 0; i < src.rows; i++) {
for (int j = 0; j < src.cols; j++) {
if (j == 0)
{ ... } // first-column handling is elided in this excerpt
else {
if (j > side/2) { // drop the column histogram at index j - side/2 - 1
for (int h = 0; h < 256; h++) // first loop the question tried to parallelize
histogram[h] -= colHisto[j - (side/2) - 1][h];
}
if (j < src.cols - side/2) { // bring in the column histogram at index j + side/2
if (i > side/2) { // remove the pixel at row i - side/2 - 1 from that column histogram
colHisto[j + side/2][src.at<uchar>(i - side/2 - 1, j + side/2)]--;
}
if (i < src.rows - side/2) { // add the pixel at row i + side/2 to that column histogram
colHisto[j + side/2][src.at<uchar>(i + side/2, j + side/2)]++;
}
for (int h = 0; h < 256; h++) // second loop the question tried to parallelize
histogram[h] += colHisto[j + side/2][h];
}
}
}
}