I want to use the AVX instruction set to accelerate a convolution operation from an 8-channel image to an 8-channel image, using a 3x3 convolution kernel. My code is as follows:
const float* kptr = kernels;
const float* bptr = biases;
__m256 _out0 = _mm256_loadu_ps(bptr);
__m256 _out1 = _mm256_setzero_ps();
__m256 _out2 = _mm256_setzero_ps();
for (int i = 0; i < 8; i ++)
{
const __m256 _r00 = _mm256_broadcast_ss(tl + i);
const __m256 _r01 = _mm256_broadcast_ss(tc + i);
const __m256 _r02 = _mm256_broadcast_ss(tr + i);
const __m256 _r03 = _mm256_broadcast_ss(ml + i);
const __m256 _r04 = _mm256_broadcast_ss(mc + i);
const __m256 _r05 = _mm256_broadcast_ss(mr + i);
const __m256 _r06 = _mm256_broadcast_ss(bl + i);
const __m256 _r07 = _mm256_broadcast_ss(bc + i);
const __m256 _r08 = _mm256_broadcast_ss(br + i);
const __m256 _k00 = _mm256_loadu_ps(kptr + i * 72);
const __m256 _k01 = _mm256_loadu_ps(kptr + i * 72 + 8);
const __m256 _k02 = _mm256_loadu_ps(kptr + i * 72 + 16);
const __m256 _k03 = _mm256_loadu_ps(kptr + i * 72 + 24);
const __m256 _k04 = _mm256_loadu_ps(kptr + i * 72 + 32);
const __m256 _k05 = _mm256_loadu_ps(kptr + i * 72 + 40);
const __m256 _k06 = _mm256_loadu_ps(kptr + i * 72 + 48);
const __m256 _k07 = _mm256_loadu_ps(kptr + i * 72 + 56);
const __m256 _k08 = _mm256_loadu_ps(kptr + i * 72 + 64);
_out0 = _mm256_fmadd_ps(_r00, _k00, _out0);
_out1 = _mm256_fmadd_ps(_r01, _k01, _out1);
_out2 = _mm256_fmadd_ps(_r02, _k02, _out2);
_out0 = _mm256_fmadd_ps(_r03, _k03, _out0);
_out1 = _mm256_fmadd_ps(_r04, _k04, _out1);
_out2 = _mm256_fmadd_ps(_r05, _k05, _out2);
_out0 = _mm256_fmadd_ps(_r06, _k06, _out0);
_out1 = _mm256_fmadd_ps(_r07, _k07, _out1);
_out2 = _mm256_fmadd_ps(_r08, _k08, _out2);
}
_out0 = _mm256_max_ps(_mm256_add_ps(_out0, _mm256_add_ps(_out1, _out2)), _mm256_setzero_ps());
_mm256_storeu_ps(outMat, _out0);
On Ryzen, this is very effective. Tested on an R5 2600 and an R5 3500U, I get a 2-4x performance improvement compared to ordinary C++ code with compiler optimization. But on Intel Core CPUs it is actually about 50% slower than ordinary C++ code with compiler optimization, tested on both an i7-8750H and an i3-4170. In fact, the 3500U is 4 times faster than the i7-8750H in this case.
I am confused about this. I found that the most time-consuming instruction on the Intel CPUs is the fmadd instruction, but there was still no improvement after replacing fmadd with the equivalent separate AVX multiply and add instructions.
I also considered the limited number of registers, but after trying to reduce the number of __m256 variables, the situation only got worse.
The compiler and parameters are the same, I compiled with msvc2019, and I even used the same binary.
The memory layout of the weights (`kptr`) is CHWB, and the input image pixels (`tl` through `br`) are laid out as BHWC.
During the test, I noticed that in the same scenario the i7-8750H runs at full load while the 2600 sits at only about 35% load, yet the 2600's performance is 8 times that of the i7-8750H.
Any suggestions?
I didn't find a good way to disassemble the binary compiled by MSVC, so I compiled it under Linux and disassembled it with GDB. Here's what I got using GDB disassembly:
-g -fopenmp -lpthread -mavx2 -mfma -O3
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
898 return *(__m256_u *)__P;
0x00007fffff710967 <+135>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffff71096b <+139>: lea 0x4(,%r13,4),%r13
0x00007fffff710973 <+147>: lea 0x4(,%rdi,4),%rdi
0x00007fffff71097b <+155>: vmovaps %ymm1,%ymm3
0x00007fffff71097f <+159>: mov (%rax),%r10
0x00007fffff710982 <+162>: mov 0x10(%r9),%rax
0x00007fffff710986 <+166>: lea 0x4(,%rsi,4),%rsi
0x00007fffff71098e <+174>: lea (%r11,%rdi,1),%rbx
0x00007fffff710992 <+178>: lea (%r11,%rsi,1),%r12
0x00007fffff710996 <+182>: lea (%rdx,%rsi,1),%r9
0x00007fffff71099a <+186>: add %r13,%r11
0x00007fffff71099d <+189>: add %rcx,%rsi
0x00007fffff7109a0 <+192>: mov (%rax),%rax
0x00007fffff7109a3 <+195>: vmovups (%r10),%xmm7
0x00007fffff7109a8 <+200>: vinsertf128 $0x1,0x10(%r10),%ymm7,%ymm0
/home/tianzer/Anime4KCPP/Anime4KCore/src/CPUCNNProcessor.cpp:
390 for (int i = 0; i < 8; i += 2)
=> 0x00007fffff7109af <+207>: lea (%rdx,%rdi,1),%r10
0x00007fffff7109b3 <+211>: add %r13,%rdx
0x00007fffff7109b6 <+214>: add %rcx,%rdi
0x00007fffff7109b9 <+217>: add %r13,%rcx
0x00007fffff7109bc <+220>: lea 0x900(%rax),%r13
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
735 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
0x00007fffff7109c3 <+227>: vbroadcastss -0x4(%rbx),%ymm11
0x00007fffff7109c9 <+233>: vmovups (%rax),%xmm5
0x00007fffff7109cd <+237>: add $0x8,%rbx
0x00007fffff7109d1 <+241>: add $0x240,%rax
0x00007fffff7109d7 <+247>: vbroadcastss -0x4(%r11),%ymm6
0x00007fffff7109dd <+253>: vbroadcastss -0x4(%r9),%ymm8
0x00007fffff7109e3 <+259>: add $0x8,%r12
0x00007fffff7109e7 <+263>: add $0x8,%r11
0x00007fffff7109eb <+267>: vbroadcastss -0x4(%rdx),%ymm7
0x00007fffff7109f1 <+273>: vbroadcastss -0x4(%rsi),%ymm4
0x00007fffff7109f7 <+279>: add $0x8,%r10
0x00007fffff7109fb <+283>: add $0x8,%r9
0x00007fffff7109ff <+287>: vbroadcastss -0xc(%r12),%ymm10
0x00007fffff710a06 <+294>: vbroadcastss -0xc(%r10),%ymm9
0x00007fffff710a0c <+300>: add $0x8,%rdx
0x00007fffff710a10 <+304>: add $0x8,%rdi
0x00007fffff710a14 <+308>: vbroadcastss -0x4(%rcx),%ymm2
0x00007fffff710a1a <+314>: vbroadcastss -0xc(%rdi),%ymm12
/usr/lib/gcc/x86_64-linux-gnu/9/include/fmaintrin.h:
65 return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
0x00007fffff710a20 <+320>: add $0x8,%rsi
0x00007fffff710a24 <+324>: add $0x8,%rcx
0x00007fffff710a28 <+328>: vinsertf128 $0x1,-0x230(%rax),%ymm5,%ymm5
0x00007fffff710a32 <+338>: vfmadd231ps %ymm5,%ymm11,%ymm0
0x00007fffff710a37 <+343>: vmovups -0x220(%rax),%xmm5
0x00007fffff710a3f <+351>: vinsertf128 $0x1,-0x210(%rax),%ymm5,%ymm5
0x00007fffff710a49 <+361>: vfmadd231ps %ymm5,%ymm10,%ymm3
0x00007fffff710a4e <+366>: vmovups -0x200(%rax),%xmm5
0x00007fffff710a56 <+374>: vinsertf128 $0x1,-0x1f0(%rax),%ymm5,%ymm5
0x00007fffff710a60 <+384>: vfmadd231ps %ymm5,%ymm6,%ymm1
0x00007fffff710a65 <+389>: vmovups -0x1e0(%rax),%xmm6
0x00007fffff710a6d <+397>: vinsertf128 $0x1,-0x1d0(%rax),%ymm6,%ymm11
0x00007fffff710a77 <+407>: vmovups -0x1c0(%rax),%xmm6
0x00007fffff710a7f <+415>: vinsertf128 $0x1,-0x1b0(%rax),%ymm6,%ymm10
0x00007fffff710a89 <+425>: vfmadd132ps %ymm11,%ymm0,%ymm9
0x00007fffff710a8e <+430>: vfmadd132ps %ymm10,%ymm3,%ymm8
0x00007fffff710a93 <+435>: vmovups -0x1a0(%rax),%xmm3
0x00007fffff710a9b <+443>: vinsertf128 $0x1,-0x190(%rax),%ymm3,%ymm6
0x00007fffff710aa5 <+453>: vfmadd132ps %ymm6,%ymm1,%ymm7
0x00007fffff710aaa <+458>: vmovups -0x180(%rax),%xmm1
0x00007fffff710ab2 <+466>: vinsertf128 $0x1,-0x170(%rax),%ymm1,%ymm5
0x00007fffff710abc <+476>: vmovups -0x160(%rax),%xmm1
0x00007fffff710ac4 <+484>: vinsertf128 $0x1,-0x150(%rax),%ymm1,%ymm3
0x00007fffff710ace <+494>: vfmadd132ps %ymm5,%ymm9,%ymm12
0x00007fffff710ad3 <+499>: vbroadcastss -0x8(%r11),%ymm1
0x00007fffff710ad9 <+505>: vbroadcastss -0x8(%r10),%ymm5
0x00007fffff710adf <+511>: vfmadd132ps %ymm3,%ymm8,%ymm4
0x00007fffff710ae4 <+516>: vbroadcastss -0x8(%r12),%ymm3
0x00007fffff710aeb <+523>: vmovaps %ymm12,%ymm11
0x00007fffff710af0 <+528>: vmovaps %ymm4,%ymm10
0x00007fffff710af4 <+532>: vmovups -0x140(%rax),%xmm4
0x00007fffff710afc <+540>: vinsertf128 $0x1,-0x130(%rax),%ymm4,%ymm0
0x00007fffff710b06 <+550>: vbroadcastss -0x8(%r9),%ymm4
0x00007fffff710b0c <+556>: vfmadd132ps %ymm0,%ymm7,%ymm2
0x00007fffff710b11 <+561>: vbroadcastss -0x8(%rbx),%ymm0
0x00007fffff710b17 <+567>: vmovaps %ymm2,%ymm6
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
735 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
0x00007fffff710b1b <+571>: vbroadcastss -0x8(%rdx),%ymm2
0x00007fffff710b21 <+577>: vbroadcastss -0x8(%rsi),%ymm8
0x00007fffff710b27 <+583>: vmovups -0x120(%rax),%xmm13
0x00007fffff710b2f <+591>: vmovups -0x100(%rax),%xmm14
0x00007fffff710b37 <+599>: vinsertf128 $0x1,-0x110(%rax),%ymm13,%ymm12
0x00007fffff710b41 <+609>: vmovups -0xe0(%rax),%xmm15
0x00007fffff710b49 <+617>: vbroadcastss -0x8(%rdi),%ymm9
0x00007fffff710b4f <+623>: vbroadcastss -0x8(%rcx),%ymm7
/usr/lib/gcc/x86_64-linux-gnu/9/include/fmaintrin.h:
65 return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
0x00007fffff710b55 <+629>: vfmadd132ps %ymm12,%ymm11,%ymm0
0x00007fffff710b5a <+634>: vinsertf128 $0x1,-0xf0(%rax),%ymm14,%ymm11
0x00007fffff710b64 <+644>: vfmadd132ps %ymm11,%ymm10,%ymm3
0x00007fffff710b69 <+649>: vinsertf128 $0x1,-0xd0(%rax),%ymm15,%ymm10
0x00007fffff710b73 <+659>: vfmadd132ps %ymm10,%ymm6,%ymm1
0x00007fffff710b78 <+664>: vmovups -0xc0(%rax),%xmm6
0x00007fffff710b80 <+672>: vinsertf128 $0x1,-0xb0(%rax),%ymm6,%ymm6
0x00007fffff710b8a <+682>: vfmadd132ps %ymm6,%ymm0,%ymm5
0x00007fffff710b8f <+687>: vmovups -0xa0(%rax),%xmm6
0x00007fffff710b97 <+695>: vinsertf128 $0x1,-0x90(%rax),%ymm6,%ymm0
0x00007fffff710ba1 <+705>: vfmadd132ps %ymm0,%ymm3,%ymm4
0x00007fffff710ba6 <+710>: vmovups -0x80(%rax),%xmm3
0x00007fffff710bab <+715>: vmovaps %ymm9,%ymm0
0x00007fffff710baf <+719>: vinsertf128 $0x1,-0x70(%rax),%ymm3,%ymm6
0x00007fffff710bb6 <+726>: vmovups -0x40(%rax),%xmm3
0x00007fffff710bbb <+731>: vinsertf128 $0x1,-0x30(%rax),%ymm3,%ymm3
0x00007fffff710bc2 <+738>: vfmadd132ps %ymm6,%ymm1,%ymm2
0x00007fffff710bc7 <+743>: vmovups -0x60(%rax),%xmm1
0x00007fffff710bcc <+748>: vinsertf128 $0x1,-0x50(%rax),%ymm1,%ymm6
0x00007fffff710bd3 <+755>: vfmadd132ps %ymm6,%ymm5,%ymm0
0x00007fffff710bd8 <+760>: vfmadd132ps %ymm8,%ymm4,%ymm3
0x00007fffff710bdd <+765>: vmovups -0x20(%rax),%xmm4
0x00007fffff710be2 <+770>: vinsertf128 $0x1,-0x10(%rax),%ymm4,%ymm1
0x00007fffff710be9 <+777>: vfmadd132ps %ymm7,%ymm2,%ymm1
/home/tianzer/Anime4KCPP/Anime4KCore/src/CPUCNNProcessor.cpp:
390 for (int i = 0; i < 8; i += 2)
0x00007fffff710bee <+782>: cmp %rax,%r13
0x00007fffff710bf1 <+785>: jne 0x7fffff7109c3 <std::_Function_handler<void(int, int, float*, float*), Anime4KCPP::CPU::CNNProcessor::conv8To8(const FP*, const FP*, cv::Mat&)::<lambda(int, int, Anime4KCPP::CPU::ChanFP, Anime4KCPP::CPU::LineFP)> >::_M_invoke(const std::_Any_data &, int &&, int &&, float *&&, float *&&)+227>
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
1230 return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
0x00007fffff710bf7 <+791>: vaddps %ymm3,%ymm0,%ymm0
0x00007fffff710bfb <+795>: vaddps %ymm1,%ymm0,%ymm0
0x00007fffff710bff <+799>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffff710c03 <+803>: vmaxps %ymm1,%ymm0,%ymm0
904 *(__m256_u *)__P = __A;
0x00007fffff710c07 <+807>: vmovups %xmm0,(%r8)
0x00007fffff710c0c <+812>: vextractf128 $0x1,%ymm0,0x10(%r8)
0x00007fffff710c13 <+819>: vzeroupper
0x00007fffff710c16 <+822>: pop %rbx
0x00007fffff710c17 <+823>: pop %r12
0x00007fffff710c19 <+825>: pop %r13
0x00007fffff710c1b <+827>: pop %rbp
0x00007fffff710c1c <+828>: retq
if I use -march=native to build:
-g -fopenmp -lpthread -march=native -O3
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
898 return *(__m256_u *)__P;
0x00007fffff711596 <+134>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffff71159a <+138>: lea 0x4(,%r10,4),%r13
0x00007fffff7115a2 <+146>: lea 0x4(,%rdi,4),%rdi
0x00007fffff7115aa <+154>: vmovaps %ymm1,%ymm2
0x00007fffff7115ae <+158>: mov (%rax),%rax
0x00007fffff7115b1 <+161>: lea 0x4(,%rsi,4),%rsi
0x00007fffff7115b9 <+169>: lea (%r11,%rdi,1),%rbx
0x00007fffff7115bd <+173>: lea (%r11,%rsi,1),%r12
0x00007fffff7115c1 <+177>: lea (%rdx,%rdi,1),%r10
0x00007fffff7115c5 <+181>: add %r13,%r11
0x00007fffff7115c8 <+184>: add %rcx,%rdi
0x00007fffff7115cb <+187>: vmovups (%rax),%ymm0
/home/tianzer/Anime4KCPP/Anime4KCore/src/CPUCNNProcessor.cpp:
390 for (int i = 0; i < 8; i += 2)
=> 0x00007fffff7115cf <+191>: mov 0x10(%r9),%rax
0x00007fffff7115d3 <+195>: lea (%rdx,%rsi,1),%r9
0x00007fffff7115d7 <+199>: add %r13,%rdx
0x00007fffff7115da <+202>: add %rcx,%rsi
0x00007fffff7115dd <+205>: add %r13,%rcx
0x00007fffff7115e0 <+208>: mov (%rax),%rax
0x00007fffff7115e3 <+211>: lea 0x900(%rax),%r13
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
735 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
0x00007fffff7115ea <+218>: vbroadcastss -0x4(%r11),%ymm4
0x00007fffff7115f0 <+224>: vbroadcastss -0x4(%rbx),%ymm3
0x00007fffff7115f6 <+230>: add $0x8,%r12
0x00007fffff7115fa <+234>: add $0x240,%rax
0x00007fffff711600 <+240>: vbroadcastss -0x4(%r10),%ymm11
0x00007fffff711606 <+246>: vbroadcastss -0x4(%r9),%ymm10
0x00007fffff71160c <+252>: add $0x8,%rbx
0x00007fffff711610 <+256>: add $0x8,%r11
0x00007fffff711614 <+260>: vbroadcastss -0x4(%rdx),%ymm9
0x00007fffff71161a <+266>: vbroadcastss -0x4(%rdi),%ymm8
0x00007fffff711620 <+272>: add $0x8,%r10
0x00007fffff711624 <+276>: add $0x8,%r9
0x00007fffff711628 <+280>: vbroadcastss -0xc(%r12),%ymm5
0x00007fffff71162f <+287>: vbroadcastss -0x4(%rsi),%ymm7
0x00007fffff711635 <+293>: add $0x8,%rdx
0x00007fffff711639 <+297>: add $0x8,%rdi
0x00007fffff71163d <+301>: vbroadcastss -0x4(%rcx),%ymm6
/usr/lib/gcc/x86_64-linux-gnu/9/include/fmaintrin.h:
65 return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
0x00007fffff711643 <+307>: add $0x8,%rsi
0x00007fffff711647 <+311>: add $0x8,%rcx
0x00007fffff71164b <+315>: vfmadd132ps -0x240(%rax),%ymm0,%ymm3
0x00007fffff711654 <+324>: vbroadcastss -0x8(%r10),%ymm0
0x00007fffff71165a <+330>: vfmadd132ps -0x220(%rax),%ymm2,%ymm5
0x00007fffff711663 <+339>: vbroadcastss -0x8(%rsi),%ymm2
0x00007fffff711669 <+345>: vfmadd231ps -0x200(%rax),%ymm4,%ymm1
0x00007fffff711672 <+354>: vbroadcastss -0x8(%rdx),%ymm4
0x00007fffff711678 <+360>: vfmadd132ps -0x1e0(%rax),%ymm3,%ymm11
0x00007fffff711681 <+369>: vbroadcastss -0x8(%rcx),%ymm3
0x00007fffff711687 <+375>: vfmadd132ps -0x1c0(%rax),%ymm5,%ymm10
0x00007fffff711690 <+384>: vbroadcastss -0x8(%r9),%ymm5
0x00007fffff711696 <+390>: vfmadd132ps -0x1a0(%rax),%ymm1,%ymm9
0x00007fffff71169f <+399>: vbroadcastss -0x8(%rdi),%ymm1
0x00007fffff7116a5 <+405>: vfmadd231ps -0x180(%rax),%ymm8,%ymm11
0x00007fffff7116ae <+414>: vbroadcastss -0x8(%rbx),%ymm8
0x00007fffff7116b4 <+420>: vfmadd231ps -0x160(%rax),%ymm7,%ymm10
0x00007fffff7116bd <+429>: vbroadcastss -0x8(%r12),%ymm7
0x00007fffff7116c4 <+436>: vfmadd231ps -0x140(%rax),%ymm6,%ymm9
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
735 return (__m256) __builtin_ia32_vbroadcastss256 (__X);
0x00007fffff7116cd <+445>: vbroadcastss -0x8(%r11),%ymm6
/usr/lib/gcc/x86_64-linux-gnu/9/include/fmaintrin.h:
65 return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
0x00007fffff7116d3 <+451>: vfmadd132ps -0x120(%rax),%ymm11,%ymm8
0x00007fffff7116dc <+460>: vfmadd132ps -0x100(%rax),%ymm10,%ymm7
0x00007fffff7116e5 <+469>: vfmadd132ps -0xe0(%rax),%ymm9,%ymm6
0x00007fffff7116ee <+478>: vfmadd132ps -0xc0(%rax),%ymm8,%ymm0
0x00007fffff7116f7 <+487>: vfmadd132ps -0xa0(%rax),%ymm7,%ymm5
0x00007fffff711700 <+496>: vfmadd132ps -0x80(%rax),%ymm6,%ymm4
0x00007fffff711706 <+502>: vfmadd132ps -0x20(%rax),%ymm4,%ymm3
0x00007fffff71170c <+508>: vfmadd231ps -0x60(%rax),%ymm1,%ymm0
0x00007fffff711712 <+514>: vfmadd132ps -0x40(%rax),%ymm5,%ymm2
0x00007fffff711718 <+520>: vmovaps %ymm3,%ymm1
/home/tianzer/Anime4KCPP/Anime4KCore/src/CPUCNNProcessor.cpp:
390 for (int i = 0; i < 8; i += 2)
0x00007fffff71171c <+524>: cmp %rax,%r13
0x00007fffff71171f <+527>: jne 0x7fffff7115ea <std::_Function_handler<void(int, int, float*, float*), Anime4KCPP::CPU::CNNProcessor::conv8To8(const FP*, const FP*, cv::Mat&)::<lambda(int, int, Anime4KCPP::CPU::ChanFP, Anime4KCPP::CPU::LineFP)> >::_M_invoke(const std::_Any_data &, int &&, int &&, float *&&, float *&&)+218>
/usr/lib/gcc/x86_64-linux-gnu/9/include/avxintrin.h:
1230 return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
0x00007fffff711725 <+533>: vaddps %ymm2,%ymm0,%ymm0
0x00007fffff711729 <+537>: vxorps %xmm1,%xmm1,%xmm1
0x00007fffff71172d <+541>: vaddps %ymm3,%ymm0,%ymm0
0x00007fffff711731 <+545>: vmaxps %ymm1,%ymm0,%ymm0
904 *(__m256_u *)__P = __A;
0x00007fffff711735 <+549>: vmovups %ymm0,(%r8)
0x00007fffff71173a <+554>: vzeroupper
0x00007fffff71173d <+557>: pop %rbx
0x00007fffff71173e <+558>: pop %r12
0x00007fffff711740 <+560>: pop %r13
0x00007fffff711742 <+562>: pop %rbp
0x00007fffff711743 <+563>: retq
Benchmark results from an Intel i3-4170 (the score is the reciprocal of the processing time multiplied by a constant factor), using the binary built by GCC, consistent with the disassembly above; the results with MSVC are almost the same:
ordinary C++ code: 4.13368
-mavx2 -mfma: 2.51132
-march=native: 2.46779
I noticed that under -march=native
compilation, vfmadd231ps fetches operands directly from memory. Is it because Intel's L2 is not big enough? Ryzen's L2 per core is twice that of Intel.