I wrote a simple for-loop that assigns a constant to every element of an array:
#include <iostream>
#include <vector>
#include <cstdlib>
#include "omp.h"
using namespace std;
// Number of OpenMP threads requested for the parallel loop; main() overwrites
// this default with the value parsed from argv[1].
int nr_threads = 1;
// J and K are only ever used as the product J*K, which is both the element
// count of the vector and the trip count of the timed loop.
long J = 10000000;
long K = 40;
// Benchmark driver: times an OpenMP-parallel loop that stores the constant 1
// into every element of a J*K-element vector<double>, then prints the elapsed
// wall-clock time in seconds (as reported by omp_get_wtime) to stdout.
//
// Usage: <program> <nr_threads>
//   argv[1]  number of OpenMP threads to use; must be a positive integer.
//
// Returns 0 on success, or EXIT_FAILURE (after a message on stderr) when the
// thread count is missing or not a positive integer.
int main(int argc, char* argv[])
{
    // With no argument, argv[1] is a null pointer and passing it to atoi() is
    // undefined behaviour, so reject that case explicitly.
    if (argc < 2)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "test_speedup")
                  << " <nr_threads>" << std::endl;
        return EXIT_FAILURE;
    }

    nr_threads = std::atoi(argv[1]);
    // atoi() yields 0 for non-numeric input, and the num_threads clause
    // requires a positive value.
    if (nr_threads < 1)
    {
        std::cerr << "nr_threads must be a positive integer, got '" << argv[1]
                  << "'" << std::endl;
        return EXIT_FAILURE;
    }

    // Compute the element count once instead of re-evaluating J*K in the
    // loop condition.
    const long total = J * K;

    // The vector is allocated and filled with 1 here, before the timer
    // starts, so only the loop below is measured.
    std::vector<double> H_U_d(total, 1);

    const double start_time = omp_get_wtime();
#pragma omp parallel for num_threads(nr_threads) schedule(static)
    for (long j = 0; j < total; j++)
    {
        H_U_d[j] = 1;
    }
    std::cout << omp_get_wtime() - start_time << std::endl;
    return 0;
}
I compile it with gcc: g++ main.cpp -o test_speedup -fopenmp
and test it on a 12-core machine.
My system is Ubuntu 14.04.3, and the CPU is an Intel(R) Xeon(R) CPU E5-2620 0 @ 2.00GHz; the machine has 128GB of RAM.
If no optimization is applied, I get the following result:
➜ ~ ./test_speedup 1
2.95739
➜ ~ ./test_speedup 8
0.483756
The speedup is around 6.
However, if I use -O3 to optimize it: g++ main.cpp -o test_speedup -fopenmp -O3
the result is:
➜ ~ ./test_speedup 1
0.379158
➜ ~ ./test_speedup 8
0.265842
The speedup is poor (only about 1.4).
How does gcc optimize the loop? Is there any way to avoid this loss of speedup?