Consider the following code:
#include <iostream>
#include <chrono>
#include <vector>
#include <numeric>
#include <cmath>
#include <omp.h>
using namespace std;
/// Monotonic clock used for every timing measurement in this file.
using myclock = std::chrono::steady_clock;

/// Returns the time elapsed between two clock readings, in seconds.
///
/// The interval is truncated to whole microseconds before being scaled,
/// so any sub-microsecond remainder is dropped.
///
/// @param begin  reading taken at the start of the measured region
/// @param end    reading taken at the end of the measured region
/// @return       (end - begin) expressed in seconds
double measure_time(myclock::time_point begin, myclock::time_point end)
{
    const auto elapsed = end - begin;
    const auto whole_micros =
        std::chrono::duration_cast<std::chrono::microseconds>(elapsed);
    constexpr double micros_per_second = 1e6;
    return whole_micros.count() / micros_per_second;
}
// Benchmark driver: fills a symmetric n x n matrix D with a value derived
// from every pair of entries of v, and prints how long the fill took in
// seconds. Only the double loop is timed; the allocations are not.
int main()
{
int n = 20000;
vector<double> v(n);
// v = 1.5, 2.5, 3.5, ... (n consecutive values starting at 1.5).
iota(v.begin(), v.end(), 1.5);
// n separately allocated rows of n zeros each. With 8-byte doubles this is
// n*n*8 bytes, i.e. about 3.2 GB for n = 20000.
vector< vector<double> > D(n, vector<double>(n,0.0));
myclock::time_point begin, end;
begin = myclock::now();
// The two OpenMP variants being compared; uncomment one at a time.
// NOTE(review): the inner loop below starts at i+1, so the iteration space
// is triangular rather than rectangular. Confirm that collapse(2) is
// permitted on such a nest by the OpenMP version the compiler implements.
//#pragma omp parallel for collapse(2)
//#pragma omp parallel for
// Visits each pair (i, j) with i < j exactly once. `n - 1` is an int and is
// converted to an unsigned type for the comparison with the size_t index.
for(size_t i = 0; i < n - 1; i++){
for(size_t j = i+1; j < n; j++){
double d = sqrt(v[i]*v[i] + v[j]*v[j] + 1.5*v[i]*v[j]);
// Upper-triangle entry: consecutive j values stay within row i.
D[i][j] = d;
// Mirrored lower-triangle entry: j changes every iteration, so each store
// goes to a different row vector.
D[j][i] = d;
}
}
// The diagonal D[i][i] is never written and keeps its initial 0.0.
end= myclock::now();
double time = measure_time(begin, end);
cout<<"Time: "<<time<<" (s)"<<endl;
return 0;
}
Compiled with:
g++ -std=c++11 -fopenmp -o main main.cpp
I obtained the following run time:
- With `#pragma omp parallel for collapse(2)`: 7.9425 (s)
- With `#pragma omp parallel for`: 3.73262 (s)
- Without OpenMP: 11.0935 (s)
System settings: Linux Mint 18.3 64-bit, g++ 5.4.0, quad-core processor.
I would expect the first to be faster than the second (which parallelizes only the outer loop) and much faster than the third.
What did I do wrong please? The first and the second both ran on all 8 threads.
Thank you in advance for your help!