I tried to port some code from Fortran to C++, using a 2000x2000 matrix multiplication implemented with the Eigen library as a test case. I found that an explicit for loop over Eigen matrices is much slower (>3x) than the equivalent do loop in Fortran. The code is listed below:
test.f90
!> Benchmark: multiply two n-by-n matrices once with a hand-written triple
!> loop and once with the intrinsic matmul, printing the elapsed time of each
!> in seconds.
program main
  use, intrinsic :: iso_fortran_env, only: dp => real64, int64
  implicit none

  integer :: n, i, j, k
  ! 64-bit counters: the tick rate of system_clock is processor-dependent, so
  ! it is queried below rather than assumed to be 1000 ticks per second.
  integer(int64) :: tic, toc, rate
  real(dp), allocatable :: a(:,:), b(:,:), c(:,:)
  real(dp) :: s

  n = 2000
  allocate(a(n,n), b(n,n), c(n,n))

  ! Every element of column i of both operands holds the value i.
  do i = 1, n
    do j = 1, n
      a(j,i) = real(i, dp)
      b(j,i) = real(i, dp)
    end do
  end do

  ! Explicit triple loop: c(i,j) is the dot product of row i of a with
  ! column j of b. The loop order is kept as-is because it is what is timed.
  call system_clock(count=tic, count_rate=rate)
  do j = 1, n
    do i = 1, n
      s = 0.0_dp
      do k = 1, n
        s = s + a(i,k) * b(k,j)
      end do
      c(i,j) = s
    end do
  end do
  call system_clock(count=toc)
  print *, 'Fortran with loop:', real(toc - tic, dp) / real(rate, dp)

  ! Same product through the intrinsic.
  call system_clock(count=tic)
  c = matmul(a, b)
  call system_clock(count=toc)
  print *, 'Fortran with matmul:', real(toc - tic, dp) / real(rate, dp)

  deallocate(a, b, c)
end program main
test.cpp
#include<Eigen/Core>
#include<time.h>
#include<chrono>
#include<iostream>
using Eigen::MatrixXd;
// Benchmark: multiply two n-by-n Eigen matrices once with a hand-written
// triple loop and once with Eigen's operator*, printing the elapsed time of
// each in seconds.
int main() {
    // Elapsed (wall-clock) time from a monotonic clock. clock() reports
    // processor time instead, which is not the same quantity once any part
    // of the work runs on more than one thread.
    using Clock = std::chrono::steady_clock;
    using Seconds = std::chrono::duration<double>;

    int n = 2000;
    MatrixXd a(n, n), b(n, n), c(n, n);

    // a holds its row index in every element, b holds its column index.
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            a(i, j) = i * 1.0;
            b(i, j) = j * 1.0;
        }
    }

    // Explicit triple loop: c(i,j) is the dot product of row i of a with
    // column j of b. The loop order is kept as-is because it is what is timed.
    Clock::time_point tic = Clock::now();
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < n; i++) {
            double s = 0.0;
            for (int k = 0; k < n; k++) {
                s += a(i, k) * b(k, j);
            }
            c(i, j) = s;
        }
    }
    Clock::time_point toc = Clock::now();
    std::cout << "Eigen with loop: " << Seconds(toc - tic).count() << std::endl;

    // Same product through Eigen's own matrix product.
    tic = Clock::now();
    c = a * b;
    toc = Clock::now();
    std::cout << "Eigen with *: " << Seconds(toc - tic).count() << std::endl;
}
Compiled with the following commands (gcc 8.4 on Ubuntu 18.04):
gfortran test.f90 -O3 -march=native -o testf
g++ test.cpp -O3 -march=native -I/path/to/eigen -o testcpp
I get the following results (times in seconds):
Fortran with loop: 10.9700003
Fortran with matmul: 0.834999979
Eigen with loop: 38.2188
Eigen with *: 0.40625
The built-in implementations (matmul and Eigen's *) are of comparable speed, so why is Eigen so much slower for the explicit loop implementation?