I compiled the code below with Visual Studio C++ 2017 using /openmp /O2 /arch:AVX. When running with 8 threads, the output is:
dt_loops = 1562ms dt_eigen = 26 ms
I expected A * B to be faster than my own handmade loops, but I did not expect such a large difference. Is there anything wrong with my code? And if not, how can Eigen3 do it so much faster?
I'm not very experienced in using OpenMP or any other parallelization method. I tried different loop orders, but the one below is the fastest.
#include <iostream>
#include <chrono>
#include <Eigen/Dense>
int main() {
std::chrono::time_point<std::chrono::system_clock> start1, end1, start2, end2;
int n = 1000;
Eigen::MatrixXd A = Eigen::MatrixXd::Random(n, n);
Eigen::MatrixXd B = Eigen::MatrixXd::Random(n, n);
Eigen::MatrixXd C = Eigen::MatrixXd::Zero(n, n);
start1 = std::chrono::system_clock::now();
int i, j, k;
#pragma omp parallel for private(i, j, k)
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
for (k = 0; k < n; ++k) {
C(i, j) += A(i, k) * B(k, j);
}
}
}
end1 = std::chrono::system_clock::now();
std::cout << "dt_loops = " << std::chrono::duration_cast<std::chrono::milliseconds>(end1-start1).count() << " ms" << std::endl;
Eigen::MatrixXd D = Eigen::MatrixXd::Zero(n, n);
start2 = std::chrono::system_clock::now();
D = A * B;
end2 = std::chrono::system_clock::now();
std::cout << "dt_eigen = " << std::chrono::duration_cast<std::chrono::milliseconds>(end2-start2).count() << " ms" << std::endl;
}