I am developing a program that involves a lot of low-latency, hard real-time matrix operations. I am using the Eigen 3 library for this.
I wish to use AVX-512F SIMD vectorization in production for performance acceleration.
I am currently experimenting on Ubuntu and have used the vcpkg package manager to install the Eigen3 library. Currently my computer supports AVX2, and I will enable AVX-512F from the BIOS later.
I am using
objdump -d main.o | grep zmm
objdump -d main.o | grep ymm
objdump -d main.o | grep xmm
commands to analyze which registers are being used.
objdump -d main.o | grep zmm
returns empty output.
objdump -d main.o | grep ymm
returns empty output.
objdump -d main.o | grep xmm
returns instructions that access and operate on xmm registers.
I wish to know: is my g++ compiler generating AVX1 (AVX-128) code for the Eigen3 library?
How do I verify that proper AVX2 or AVX-512F SIMD code is being generated?
Small Snippet of objdump
3d84: 0f 28 08 movaps (%rax),%xmm1
3d87: 0f 29 4d d0 movaps %xmm1,-0x30(%rbp)
3d8b: 0f 29 45 e0 movaps %xmm0,-0x20(%rbp)
3d8f: 0f 28 45 d0 movaps -0x30(%rbp),%xmm0
3d93: 0f 12 45 e8 movlps -0x18(%rbp),%xmm0
3d9f: 0f 28 08 movaps (%rax),%xmm1
3da2: 0f 29 4d b0 movaps %xmm1,-0x50(%rbp)
3da6: 0f 29 45 c0 movaps %xmm0,-0x40(%rbp)
3daa: 0f 28 45 b0 movaps -0x50(%rbp),%xmm0
3dae: 0f 58 45 c0 addps -0x40(%rbp),%xmm0
3db2: 0f 29 45 80 movaps %xmm0,-0x80(%rbp)
3db6: 0f 28 45 80 movaps -0x80(%rbp),%xmm0
3dba: 0f 28 4d 80 movaps -0x80(%rbp),%xmm1
3dbe: 0f c6 c1 01 shufps $0x1,%xmm1,%xmm0
Update
Code
#include <eigen3/Eigen/Core>
#include <eigen3/Eigen/Dense>
using namespace Eigen;
int main()
{
// Matrix4f a, b, cadd, cmul, ci, ct, d;
// a = Matrix4f::Random();
// b = Matrix4f::Random();
MatrixXf a(100, 100),b(100, 100),cadd(100, 100), cmul(100, 100), ci(100, 100), ct(100, 100);
a = MatrixXf::Random(100, 100);
b = MatrixXf::Random(100, 100);
cadd = a + b;
cmul = a * b;
ci = cadd.inverse();
ct = cadd.transpose();
}
Build Command
g++ -Wall -fexceptions -I/home/user/vcpkg/installed/x64-linux/include -c /home/user/Desktop/VectorClass/main.cpp -o obj/Debug/main.o
g++ -L/home/user/vcpkg/installed/x64-linux/debug/lib -o bin/Debug/VectorClass obj/Debug/main.o -mavx2 -mtune=native -host=native -march=native
Final Object Dump
objdump -d main.o | grep ymm
14f: c5 fe 7f 45 98 vmovdqu %ymm0,-0x68(%rbp)
231: c5 fd 7f 85 50 fe ff vmovdqa %ymm0,-0x1b0(%rbp)
objdump -d main.o | grep zmm
a1: 62 f1 7c 48 28 74 24 vmovaps 0x80(%rsp),%zmm6
ac: 62 f1 4c 48 58 44 24 vaddps 0x40(%rsp),%zmm6,%zmm0
b4: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0xc0(%rsp)
228: 62 f1 7c 48 28 7c 24 vmovaps 0x180(%rsp),%zmm7
230: 62 f1 7c 48 29 7c 24 vmovaps %zmm7,0x100(%rsp)
2d0: 62 f1 7d 48 6f 05 00 vmovdqa32 0x0(%rip),%zmm0 # 2da <main+0x2da>
2dd: 62 f2 7d 48 16 44 24 vpermps 0xc0(%rsp),%zmm0,%zmm0
2e5: 62 f1 7c 48 29 44 24 vmovaps %zmm0,0x180(%rsp)
Build Command (used to produce the final object dump above)
g++ -Wall -fexceptions -I/home/user/vcpkg/installed/x64-linux/include -c main.cpp -o main.out -mavx512f -mfma -mtune=native -host=native -march=native -mprefer-vector-width=512 -O3 -fno-math-errno -ffinite-math-only -fno-rounding-math -funsafe-math-optimizations