I have the following loop I use for the force calculations in a MD-simulation:
struct data_str{
int a = 0, b = 0;
data_str(int a, int b) : a(a), b(b)
{
};
};
void work_func_both(std::vector<data_str> *data, int dist)
{
for(unsigned int i = 0; i < data->size()-1; i++)
{
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
It is a reduced version of the force-calculation loop I have in my application. My main problem is: It takes time, especially for larger vectors to calculate everything. Thus I wanted to optimize it, and tested the following program:
#include <iostream>
#include <omp.h>
#include <vector>
#include <chrono>
#define VECTOR_SIZE 1e3
#define ROUNDS 1e5
struct data_str{
int a = 0, b = 0;
data_str(int a, int b) : a(a), b(b)
{
};
};
void work_func_both(std::vector<data_str> *data, int dist)
{
#pragma omp parallel for
for(unsigned int i = 0; i < data->size()-1; i++)
{
#pragma omp parallel for
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
void work_func_first(std::vector<data_str> *data, int dist)
{
#pragma omp parallel for
for(unsigned int i = 0; i < data->size()-1; i++)
{
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
void work_func_second(std::vector<data_str> *data, int dist)
{
for(unsigned int i = 0; i < data->size()-1; i++)
{
#pragma omp parallel for
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
void work_func_none(std::vector<data_str> *data, int dist)
{
for(unsigned int i = 0; i < data->size()-1; i++)
{
for(unsigned int j = i+1; j < data->size(); j++)
{
if(abs((*data)[i].a - (*data)[j].a) < dist)
{
(*data)[i].b += 3;
(*data)[j].b -= 1;
}
}
}
}
int main(void)
{
std::vector<data_str> counter_vec1, counter_vec2, counter_vec3, counter_vec4;
for(unsigned int i = 0; i < VECTOR_SIZE; i++)
{
counter_vec1.push_back(data_str(i, i));
counter_vec2.push_back(data_str(i, i));
counter_vec3.push_back(data_str(i, i));
counter_vec4.push_back(data_str(i, i));
}
omp_set_num_threads(8);
const int length = 2;
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_both(&counter_vec1, length);
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
auto duration_both = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_first(&counter_vec2, length);
}
t2 = std::chrono::high_resolution_clock::now();
auto duration_first = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_second(&counter_vec3, length);
}
t2 = std::chrono::high_resolution_clock::now();
auto duration_second = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
t1 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < ROUNDS; i++)
{
work_func_none(&counter_vec4, length);
}
t2 = std::chrono::high_resolution_clock::now();
auto duration_none = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
std::cout << "Both took " << duration_both/ROUNDS << ", first took " << duration_first/ROUNDS << ", second took " << duration_second/ROUNDS << " and none took " << duration_none/ROUNDS << '\n';
return 0;
};
Compilation line is
g++-5 -std=c++14 -O3 -fopenmp main.cpp -lgomp -o main
and the resulting output is
Both took 593.704, first took 547.549, second took 3394.53 and none took 856.049
My problem is now: Why is only the _second()
-function returning correct results when comparing to the original version, and why is it so much slower in comparison to all the other loops? How could I optimize that loop for making it faster in total?