I wrote a simple program to measure the performance of a parallel algorithm. Here is the code:
#include <algorithm>
#include <chrono>
#include <execution>
#include <iostream>
#include <numeric>
#include <vector>
/*
 * Times an element-wise squaring of one million floats twice: once with a
 * plain range-for loop and once with std::for_each under
 * std::execution::par. The elapsed milliseconds of each run are printed to
 * stdout.
 *
 * Build note: with GCC's libstdc++ the parallel execution policies rely on
 * Intel TBB, so the program has to be built with TBB available and linked
 * (e.g. `g++ -std=c++17 -O2 file.cpp -ltbb`). Compile with optimizations
 * enabled, otherwise the timings mostly reflect unoptimized code.
 * NOTE(review): the work per element is a single multiplication, so the
 * loop is presumably limited by memory bandwidth; expect only a modest gain
 * from extra threads even when the parallel backend is active - confirm by
 * measuring.
 */
int main()
{
    // steady_clock is monotonic and therefore the right clock for measuring
    // intervals; high_resolution_clock is not guaranteed to be steady.
    using Clock = std::chrono::steady_clock;
    using Milliseconds = std::chrono::duration<double, std::milli>;

    std::vector<float> data(1'000'000, 0.0f);

    // Runs `work` once and returns the elapsed wall time in milliseconds.
    const auto time_ms = [](auto&& work) {
        const auto start = Clock::now();
        work();
        const auto stop = Clock::now();
        return Milliseconds(stop - start).count();
    };

    // Sequential baseline: square every element in place.
    std::iota(std::begin(data), std::end(data), 0);
    const double sequential_ms = time_ms([&data] {
        for (auto& item : data) {
            item = item * item;
        }
    });
    std::cout << "non-optimized version : " << sequential_ms << " milisecs" << '\n';

    // Reset the input so both variants operate on identical values.
    std::iota(std::begin(data), std::end(data), 0);
    const double parallel_ms = time_ms([&data] {
        std::for_each(std::execution::par, std::begin(data), std::end(data),
                      [](float& item) { item = item * item; });
    });
    std::cout << "paralell version : " << parallel_ms << " milisecs" << '\n';

    return 0;
}
But to my surprise, I see no improvement at all, regardless of the amount of data in the vector. What's wrong with the STL algorithms? The compiler is gcc-10.