
I need to implement fast bivariate polynomial evaluation (for a polynomial whose degrees in x and y are fixed at compile time). I came up with the following example program:

#include <cmath>    // std::fma
#include <cstdlib>  // std::rand
#include <array>
#include <iostream>

int main()
{
    constexpr std::size_t NX = 5, NY = 4;
    using XA = std::array< double, NX >;
    using YA = std::array< XA, NY >;
    YA uu{};

    // Fill the coefficient array with some non-trivial values.
    for(std::size_t yi = 0; yi < NY; yi++) {
        for(std::size_t xi = 0; xi < NX; xi++) {
            uu[yi][xi] = xi + yi;
            std::cerr << "uu["<< yi << ","<< xi << "] = " << uu[yi][xi] << '\n';
        }
    }

    double sum{0}, x = std::rand(), y = std::rand();
    // Nested Horner scheme: evaluate each row polynomial in x,
    // then combine the row results with Horner in y.
    for(auto iy = uu.rbegin(); iy != uu.rend(); iy++) {

        auto ix = iy->rbegin();
        double res = *ix++;
        for(; ix != iy->rend(); ix++) {
            res = std::fma(res, x, *ix);
        }
        sum = std::fma(sum, y, res);
    }
    std::cerr << "XXXX: " << sum << '\n';

    return 0;
}
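
For reference (just restating what the loops above do): they evaluate

    p(x, y) = Σ_{i<NY} ( Σ_{j<NX} uu[i][j] · x^j ) · y^i

with the inner sum Hornerized in x and the outer sum Hornerized in y, i.e. NY·(NX−1) inner FMAs plus NY outer FMAs, NX·NY = 20 FMAs in total.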

When I compile it with GCC 8.1 and -mfma -msse4.2 -O3 -DNDEBUG, I get pretty decent optimization: both loops are fully unrolled into a straight-line sequence of 20 vfmadd instructions:

.text:0000000000402D2A                 call    rand
.text:0000000000402D2F                 vxorpd  xmm6, xmm6, xmm6
.text:0000000000402D33                 vcvtsi2sd xmm6, xmm6, eax
.text:0000000000402D37                 call    rand
.text:0000000000402D3C                 mov     rcx, cs:_refptr__ZSt4cerr
.text:0000000000402D43                 vxorpd  xmm1, xmm1, xmm1
.text:0000000000402D47                 mov     r8d, 6
.text:0000000000402D4D                 vcvtsi2sd xmm1, xmm1, eax
.text:0000000000402D51                 vmovsd  xmm2, [rsp+128h+var_D8]
.text:0000000000402D57                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_E0]
.text:0000000000402D5E                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_E8]
.text:0000000000402D65                 lea     rdx, aXxxx      ; "XXXX: "
.text:0000000000402D6C                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_F0]
.text:0000000000402D73                 vmovsd  xmm0, [rsp+128h+var_B0]
.text:0000000000402D79                 vfmadd213sd xmm2, xmm6, [rsp+128h+var_F8]
.text:0000000000402D80                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_B8]
.text:0000000000402D87                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_C0]
.text:0000000000402D8E                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_C8]
.text:0000000000402D95                 vmovapd xmm3, xmm0
.text:0000000000402D99                 vmovsd  xmm0, [rsp+128h+var_88]
.text:0000000000402DA2                 vfmadd213sd xmm3, xmm6, [rsp+128h+var_D0]
.text:0000000000402DA9                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_90]
.text:0000000000402DB3                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_98]
.text:0000000000402DBD                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_A0]
.text:0000000000402DC7                 vmovapd xmm4, xmm0
.text:0000000000402DCB                 vmovsd  xmm0, [rsp+128h+var_60]
.text:0000000000402DD4                 vfmadd213sd xmm4, xmm6, [rsp+128h+var_A8]
.text:0000000000402DDE                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_68]
.text:0000000000402DE8                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_70]
.text:0000000000402DF2                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_78]
.text:0000000000402DFC                 vfmadd213sd xmm0, xmm6, [rsp+128h+var_80]
.text:0000000000402E06                 vfmadd231sd xmm0, xmm1, cs:qword_404018
.text:0000000000402E0F                 vfmadd132sd xmm0, xmm4, xmm1
.text:0000000000402E14                 vfmadd132sd xmm0, xmm3, xmm1
.text:0000000000402E19                 vfmadd132sd xmm1, xmm2, xmm0
.text:0000000000402E1E                 vmovapd xmm6, xmm1
.text:0000000000402E22                 call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x ; std::__ostream_insert<char,std::char_traits<char>>(std::basic_ostream<char,std::char_traits<char>> &,char const*,long long)

But when I compile with MSVC 2019 and the flags

/W3 /GR /EHsc /Ox /MD /Ob2 /fp:fast /GL /arch:AVX2

I get the following, where both loops survive as actual compare-and-branch loops:

.text:000000014000218B                 add     rsi, 8
.text:000000014000218F                 cmp     rdi, 5
.text:0000000140002193                 jb      loc_140002110
.text:0000000140002199                 inc     rbx
.text:000000014000219C                 cmp     rbx, 4
.text:00000001400021A0                 jb      loc_140002100
.text:00000001400021A6                 vxorpd  xmm6, xmm6, xmm6
.text:00000001400021AA                 call    cs:rand
.text:00000001400021B0                 vxorps  xmm7, xmm7, xmm7
.text:00000001400021B4                 vcvtsi2sd xmm7, xmm7, eax
.text:00000001400021B8                 call    cs:rand
.text:00000001400021BE                 vxorps  xmm2, xmm2, xmm2
.text:00000001400021C2                 vcvtsi2sd xmm2, xmm2, eax
.text:00000001400021C6                 lea     rcx, [rsp+0F8h+var_40]
.text:00000001400021CE                 xchg    ax, ax
.text:00000001400021D0
.text:00000001400021D0 loc_1400021D0:                          ; CODE XREF: main+15B↓j
.text:00000001400021D0                 vmovsd  xmm1, qword ptr [rcx]
.text:00000001400021D4                 lea     rdx, [rcx-20h]
.text:00000001400021D8                 mov     rax, rcx
.text:00000001400021DB                 cmp     rcx, rdx
.text:00000001400021DE                 jz      short loc_1400021F6
.text:00000001400021E0
.text:00000001400021E0 loc_1400021E0:                          ; CODE XREF: main+144↓j
.text:00000001400021E0                 add     rax, 0FFFFFFFFFFFFFFF8h
.text:00000001400021E4                 vmovaps xmm0, xmm7
.text:00000001400021E8                 vfmadd213sd xmm0, xmm1, qword ptr [rax]
.text:00000001400021ED                 vmovapd xmm1, xmm0
.text:00000001400021F1                 cmp     rax, rdx
.text:00000001400021F4                 jnz     short loc_1400021E0
.text:00000001400021F6
.text:00000001400021F6 loc_1400021F6:                          ; CODE XREF: main+12E↑j
.text:00000001400021F6                 sub     rcx, 28h ; '('
.text:00000001400021FA                 lea     rdx, [rsp+0F8h+var_D8]
.text:00000001400021FF                 vfmadd213sd xmm6, xmm2, xmm1
.text:0000000140002204                 lea     rax, [rcx+8]
.text:0000000140002208                 cmp     rax, rdx
.text:000000014000220B                 jnz     short loc_1400021D0
.text:000000014000220D                 mov     rcx, cs:?cerr@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A ; std::basic_ostream<char,std::char_traits<char>> std::cerr
.text:0000000140002214                 lea     rdx, aXxxx      ; "XXXX: "
.text:000000014000221B                 call    sub_140001080

So MSVC is not even able to unroll loops that go over the elements of a std::array? Or am I missing some optimization options?

pem
  • *produce fma intrinsics* - Huh? MSVC compiles to machine code, not to C with intrinsics. Probably you meant FMA *instructions*, but do any of those options tell it to assume the target machine can use FMA instructions? It looks like no, since that would imply AVX support but it's not using VEX instructions (like `vmulsd` instead of legacy SSE `mulsd`). IDK if `/arch:AVX2` implies FMA; there is at least one Via CPU with AVX2 but not FMA, but it's not widely used *at all*. – Peter Cordes Aug 16 '22 at 10:33
  • @PeterCordes: you are right; when I add the ```/arch:AVX2``` compiler option, MSVC compiles ```std::fma``` into the appropriate FMA instructions. But it's still not able to unroll the loop that goes over the elements of std::array? – pem Aug 16 '22 at 11:52
  • I am not sure why my question is marked as a duplicate? It's mainly not about FMA but about MSVC's general optimization abilities. – pem Aug 16 '22 at 11:54
  • The one thing you mentioned in the text of your question was the use of FMA instructions. Anyway, unrolling doesn't seem super important for a loop that bottlenecks on FMA latency, although it is relatively short, so unrolling could let it overlap better with surrounding code. And MSVC looks to be wasting instructions; I don't think both `vmovapd` instructions are necessary. Anyway, it's basically just a tuning choice, to unroll or not. GCC happens to be more aggressive. MSVC is known to be less good at optimizing in general: https://www.agner.org/optimize/blog/read.php?i=1015 – Peter Cordes Aug 16 '22 at 12:14
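
Not from the thread, but for completeness: one way to take the unrolling decision away from the optimizer is to unroll at compile time yourself. Below is a minimal sketch (my own, hypothetical, untested against MSVC 2019 specifically; requires C++17 fold expressions, e.g. /std:c++17 on MSVC) that expands the same nested Horner scheme through std::index_sequence:

#include <cmath>
#include <cstddef>
#include <array>
#include <utility>

// Horner in x over one row, unrolled via a pack expansion.
// J... runs over 0..NX-2; the row is consumed from the highest
// coefficient downwards, mirroring the reverse iterators above.
template< typename Row, std::size_t... J >
double horner_row( const Row& row, double x, std::index_sequence<J...> )
{
    constexpr std::size_t NX = std::tuple_size<Row>::value;
    double res = row[NX - 1];
    // Fold over the comma operator: one fma per remaining coefficient,
    // evaluated left to right.
    ( (res = std::fma(res, x, row[NX - 2 - J])), ... );
    return res;
}

// Horner in y over the rows, last row first, exactly like the
// rbegin/rend loop in the question.
template< typename Grid, std::size_t... I >
double horner_grid( const Grid& uu, double x, double y, std::index_sequence<I...> )
{
    constexpr std::size_t NY = std::tuple_size<Grid>::value;
    constexpr std::size_t NX = std::tuple_size<typename Grid::value_type>::value;
    double sum = 0.0;
    ( (sum = std::fma(sum, y,
          horner_row(uu[NY - 1 - I], x, std::make_index_sequence<NX - 1>{}))), ... );
    return sum;
}

// Usage: double p = horner_grid(uu, x, y, std::make_index_sequence<NY>{});

Whether this actually yields the straight-line FMA chain under MSVC 2019 would need checking; it merely removes the loop structure the optimizer would otherwise have to unroll.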

0 Answers