I'm trying to parallelize a simple loop using OpenMP. Below is my code:
#include <iostream>
#include <omp.h>
#include <time.h>
#define SIZE 10000000
float calculate_time(clock_t start, clock_t end) {
return (float) ((end - start) / (double) CLOCKS_PER_SEC) * 1000;
}
void openmp_test(double * x, double * y, double * res, int threads){
clock_t start, end;
std::cout << std::endl << "OpenMP, " << threads << " threads" << std::endl;
start = clock();
#pragma omp parallel for num_threads(threads)
for(int i = 0; i < SIZE; i++){
res[i] = x[i] * y[i];
}
end = clock();
for(int i = 1; i < SIZE; i++){
res[0] += res[i];
}
std::cout << "time: " << calculate_time(start, end) << std::endl;
std::cout << "result: " << res[0] << std::endl;
}
int main() {
double *dbl_x = new double[SIZE];
double *dbl_y = new double[SIZE];
double *res = new double[SIZE];
for(int i = 0; i < SIZE; i++){
dbl_x[i] = i % 1000;
dbl_y[i] = i % 1000;
}
openmp_test(dbl_x, dbl_y, res, 1);
openmp_test(dbl_x, dbl_y, res, 1);
openmp_test(dbl_x, dbl_y, res, 2);
openmp_test(dbl_x, dbl_y, res, 4);
openmp_test(dbl_x, dbl_y, res, 8);
delete [] dbl_x;
delete [] dbl_y;
delete [] res;
return 0;
}
I compile it as below
g++ -O3 -fopenmp main.cpp -o ompTest
However, after running the test on a Core-i7, I have the following results:
OpenMP, 1 threads time: 31.468 result: 3.32834e+12
OpenMP, 1 threads time: 18.663 result: 3.32834e+12
OpenMP, 2 threads time: 34.393 result: 3.32834e+12
OpenMP, 4 threads time: 56.31 result: 3.32834e+12
OpenMP, 8 threads time: 108.54 result: 3.32834e+12
I don't understand what I'm doing wrong? Why OpenMP slows down the calculations?
And also, why the first result is significantly slower than the second (both with 1 omp thread)?
My test environment: Core i7-4702MQ CPU @ 2.20GHz, Ubuntu 18.04.2 LTS, g++ 7.4.0.