The following minimal benchmark is rebuilt as single-threaded code with -O3 -march=native
on each machine and multiplies matrices that are either square or highly non-square (one dimension is 2):
#include <Eigen/Core>
#include <chrono>
#include <iomanip>
#include <iostream>
// Renders the dimensions of `m` as "(rows, cols)" for the benchmark report line.
std::string show_shape(const Eigen::MatrixXf& m)
{
    std::string text = "(";
    text += std::to_string(m.rows());
    text += ", ";
    text += std::to_string(m.cols());
    text += ")";
    return text;
}
// Times `runs` back-to-back evaluations of the product a * b and prints the
// total elapsed wall time, in whole milliseconds, followed by both operand shapes.
//
// a, b: the left and right operands of the product.
// Output: one line on std::cout of the form
//   "<elapsed, right-aligned to width 5> ms <- (rows, cols) * (rows, cols)".
//
// The timed region includes constructing and destroying the result matrix on
// every run, because `c` is created inside the loop.
void measure_gemm(const Eigen::MatrixXf& a, const Eigen::MatrixXf& b)
{
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by wall-clock adjustments while the benchmark is running.
    using bench_clock = std::chrono::steady_clock;
    constexpr std::size_t runs = 10;

    const auto started = bench_clock::now();
    for (std::size_t i = 0; i < runs; ++i)
    {
        // The product is materialized into a fresh matrix and then discarded.
        Eigen::MatrixXf c = a * b;
    }
    const auto finished = bench_clock::now();

    // Convert with duration_cast rather than dividing a raw tick count: the
    // tick period of a clock is implementation-defined, not always nanoseconds.
    const auto elapsed_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(finished - started).count();

    std::cout << std::setw(5) << elapsed_ms <<
        " ms <- " << show_shape(a) + " * " + show_shape(b) << std::endl;
}
int main()
{
measure_gemm(Eigen::MatrixXf::Zero(2, 4096), Eigen::MatrixXf::Zero(4096, 16384));
measure_gemm(Eigen::MatrixXf::Zero(1536, 1536), Eigen::MatrixXf::Zero(1536, 1536));
measure_gemm(Eigen::MatrixXf::Zero(16384, 4096), Eigen::MatrixXf::Zero(4096, 2));
}
which can easily be run with the following Dockerfile:
# Builds and runs the wide_vs_tall GEMM benchmark during `docker build`, so the
# hardware report and timings appear in the build output.
FROM ubuntu:20.04
# Keep apt from prompting interactively during the build.
ENV DEBIAN_FRONTEND=noninteractive
# Update and install in a single layer so a cached, stale package index is
# never paired with a later install step.
RUN apt-get update && apt-get install -y build-essential wget cmake git lshw
# Install Eigen 3.3.7 and make its headers reachable as <Eigen/...>.
RUN git clone -b '3.3.7' --single-branch --depth 1 https://github.com/eigenteam/eigen-git-mirror \
    && cd eigen-git-mirror \
    && mkdir -p build \
    && cd build \
    && cmake .. \
    && make \
    && make install \
    && ln -s /usr/local/include/eigen3/Eigen /usr/local/include/Eigen
# Alternative to the download below: use a local copy of the benchmark source.
#ADD wide_vs_tall.cpp .
RUN wget https://gist.githubusercontent.com/Dobiasd/78b32fd4aa2fc83d8da3935d690c623a/raw/5626198a533473157d6a19a824f20ebe8678e9cf/wide_vs_tall.cpp
# -march=native targets the CPU of the machine executing this build.
RUN g++ -std=c++14 -O3 -march=native wide_vs_tall.cpp -o main
# Intended as a cache buster: the fetched random bytes differ on every build,
# so the reporting and benchmark steps below are re-executed each time.
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
RUN lscpu
RUN lshw -short -C memory
RUN ./main
wget https://gist.githubusercontent.com/Dobiasd/8e27e5a96989fa8e4f942900fe609998/raw/8a07fee1a015c8c8e47066a7ac92891850b70a14/Dockerfile
docker build --rm .
produces the following results:
Tobias' workstation (Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz)
359 ms <- (2, 4096) * (4096, 16384)
761 ms <- (1536, 1536) * (1536, 1536)
597 ms <- (16384, 4096) * (4096, 2)
sysbench --cpu-max-prime=20000 --num-threads=1 cpu run
CPU speed:
events per second: 491.14
Keith's workstation (Intel(R) Core(TM) i9-9960X CPU @ 3.10GHz)
437 ms <- (2, 4096) * (4096, 16384)
724 ms <- (1536, 1536) * (1536, 1536)
789 ms <- (16384, 4096) * (4096, 2)
sysbench --cpu-max-prime=20000 --num-threads=1 cpu run
CPU speed:
events per second: 591.58
Why is Tobias' workstation faster than Keith's workstation in 2 of the 3 GEMMs, even though Keith's workstation shows better sysbench results? I'd expect the i9-9960X to be much faster, because its -march=native includes AVX512 and its single-core clock speed is higher.