GCC -O0 Generating Weird AVX extra store/reload instructions with intrinsics

Question

I have created a simple, vectorized C function to square each element in an array. The code is as follows:

#include <immintrin.h>
#include <stdlib.h>

/*
 * Squares every element of arr and writes the results to outarr.
 *
 * arr    - input array of len doubles. It is read with _mm256_load_pd, an
 *          aligned 256-bit load, so it is presumably expected to be 32-byte
 *          aligned (main allocates it with aligned_alloc(32, ...)).
 * len    - number of elements in both arr and outarr.
 * outarr - output array of len doubles. It is written with _mm256_stream_pd,
 *          a non-temporal 256-bit store, so presumably 32-byte aligned too.
 */
void square(const double* arr, uint len, double* outarr) {
    __m256d v;

    /*
     * Vector loop: loads four doubles, multiplies the vector by itself and
     * streams the four products to outarr.
     *
     * NOTE(review): assuming uint is an unsigned type (the call site passes
     * 100u), len - 4 wraps around to a huge value when len < 4, so this loop
     * would run past the end of both arrays for such lengths. Confirm that
     * callers always pass len >= 4, or guard against it.
     */
    for (uint i = 0; i <= len - 4; i += 4) {
        v = _mm256_load_pd(&arr[i]);
        _mm256_stream_pd(&outarr[i], _mm256_mul_pd(v, v));
    }

    /*
     * Scalar tail: starts at len rounded down to a multiple of 4 and squares
     * the remaining len & 3 elements one at a time.
     */
    for (uint i = len-(len&3u); i < len; i++) {
        outarr[i] = arr[i]*arr[i];
    }
}

/*
 * Driver: allocates two 32-byte-aligned buffers of 100 doubles, fills the
 * input with known values, squares it with square() and releases the buffers.
 *
 * Returns 0 on success, 1 if either allocation fails.
 */
int main(void) {
    enum { COUNT = 100 };

    /* 32-byte alignment matches the aligned AVX load/store used by square(). */
    double* inp = aligned_alloc(32, COUNT * sizeof *inp);
    double* out = aligned_alloc(32, COUNT * sizeof *out);
    if (inp == NULL || out == NULL) {
        /* free(NULL) is a no-op, so both can be released unconditionally. */
        free(inp);
        free(out);
        return 1;
    }

    /* Give the input defined values so square() never reads uninitialized memory. */
    for (int i = 0; i < COUNT; i++) {
        inp[i] = (double)i;
    }

    square(inp, COUNT, out);

    free(out);
    free(inp);
    return 0;
}

When I compile this code with:

gcc main.c -mavx -o main

I get the following disassembly for the square function:

   0x0000000000400546 <+0>:     lea    0x8(%rsp),%r10
   0x000000000040054b <+5>:     and    $0xffffffffffffffe0,%rsp
   0x000000000040054f <+9>:     pushq  -0x8(%r10)
   0x0000000000400553 <+13>:    push   %rbp
   0x0000000000400554 <+14>:    mov    %rsp,%rbp
   0x0000000000400557 <+17>:    push   %r10
   0x0000000000400559 <+19>:    sub    $0x50,%rsp
   0x000000000040055d <+23>:    mov    %rdi,-0xb8(%rbp)
   0x0000000000400564 <+30>:    mov    %esi,-0xbc(%rbp)
   0x000000000040056a <+36>:    mov    %rdx,-0xc8(%rbp)
   0x0000000000400571 <+43>:    movl   $0x0,-0xa8(%rbp)
   0x000000000040057b <+53>:    jmpq   0x400615 <square+207>
   0x0000000000400580 <+58>:    mov    -0xa8(%rbp),%eax
   0x0000000000400586 <+64>:    cltq   
   0x0000000000400588 <+66>:    lea    0x0(,%rax,8),%rdx
   0x0000000000400590 <+74>:    mov    -0xb8(%rbp),%rax
   0x0000000000400597 <+81>:    add    %rdx,%rax
   0x000000000040059a <+84>:    mov    %rax,-0xa0(%rbp)
   0x00000000004005a1 <+91>:    mov    -0xa0(%rbp),%rax
   0x00000000004005a8 <+98>:    vmovapd (%rax),%ymm0
   0x00000000004005ac <+102>:   vmovapd %ymm0,-0x90(%rbp)
   0x00000000004005b4 <+110>:   vmovapd -0x90(%rbp),%ymm0
   0x00000000004005bc <+118>:   vmovapd %ymm0,-0x70(%rbp)
   0x00000000004005c1 <+123>:   vmovapd -0x90(%rbp),%ymm0
   0x00000000004005c9 <+131>:   vmovapd %ymm0,-0x30(%rbp)
   0x00000000004005ce <+136>:   vmovapd -0x70(%rbp),%ymm0
   0x00000000004005d3 <+141>:   vmulpd -0x30(%rbp),%ymm0,%ymm0
   0x00000000004005d8 <+146>:   mov    -0xa8(%rbp),%eax
   0x00000000004005de <+152>:   cltq   
   0x00000000004005e0 <+154>:   lea    0x0(,%rax,8),%rdx
   0x00000000004005e8 <+162>:   mov    -0xc8(%rbp),%rax
   0x00000000004005ef <+169>:   add    %rdx,%rax
   0x00000000004005f2 <+172>:   mov    %rax,-0x98(%rbp)
   0x00000000004005f9 <+179>:   vmovapd %ymm0,-0x50(%rbp)
   0x00000000004005fe <+184>:   mov    -0x98(%rbp),%rax
   0x0000000000400605 <+191>:   vmovapd -0x50(%rbp),%ymm0
   0x000000000040060a <+196>:   vmovntpd %ymm0,(%rax)
   0x000000000040060e <+200>:   addl   $0x4,-0xa8(%rbp)
   0x0000000000400615 <+207>:   mov    -0xbc(%rbp),%eax
   0x000000000040061b <+213>:   lea    -0x4(%rax),%edx
   0x000000000040061e <+216>:   mov    -0xa8(%rbp),%eax
   0x0000000000400624 <+222>:   cmp    %eax,%edx
   0x0000000000400626 <+224>:   jae    0x400580 <square+58>
   0x000000000040062c <+230>:   mov    -0xbc(%rbp),%eax
   0x0000000000400632 <+236>:   and    $0xfffffffc,%eax
   0x0000000000400635 <+239>:   mov    %eax,-0xa4(%rbp)
   0x000000000040063b <+245>:   jmp    0x40069c <square+342>
   0x000000000040063d <+247>:   mov    -0xa4(%rbp),%eax
   0x0000000000400643 <+253>:   lea    0x0(,%rax,8),%rdx
   0x000000000040064b <+261>:   mov    -0xc8(%rbp),%rax
   0x0000000000400652 <+268>:   add    %rdx,%rax

Then, when I compile the code with:

gcc main.c -mavx -O3 -o main

I get the following disassembly for the square function:

   0x00000000004005c0 <+0>:     lea    -0x4(%rsi),%r9d
   0x00000000004005c4 <+4>:     mov    %rdi,%r8
   0x00000000004005c7 <+7>:     mov    %rdx,%rcx
   0x00000000004005ca <+10>:    xor    %eax,%eax
   0x00000000004005cc <+12>:    nopl   0x0(%rax)
   0x00000000004005d0 <+16>:    vmovapd (%r8),%ymm0
   0x00000000004005d5 <+21>:    add    $0x4,%eax
   0x00000000004005d8 <+24>:    add    $0x20,%r8
   0x00000000004005dc <+28>:    add    $0x20,%rcx
   0x00000000004005e0 <+32>:    vmulpd %ymm0,%ymm0,%ymm0
   0x00000000004005e4 <+36>:    vmovntpd %ymm0,-0x20(%rcx)
   0x00000000004005e9 <+41>:    cmp    %eax,%r9d
   0x00000000004005ec <+44>:    jae    0x4005d0 <square+16>
   0x00000000004005ee <+46>:    mov    %esi,%eax
   0x00000000004005f0 <+48>:    and    $0xfffffffc,%eax
   0x00000000004005f3 <+51>:    cmp    %eax,%esi
   0x00000000004005f5 <+53>:    jbe    0x400617 <square+87>
   0x00000000004005f7 <+55>:    nopw   0x0(%rax,%rax,1)
   0x0000000000400600 <+64>:    mov    %eax,%ecx
   0x0000000000400602 <+66>:    add    $0x1,%eax
   0x0000000000400605 <+69>:    vmovsd (%rdi,%rcx,8),%xmm0
   0x000000000040060a <+74>:    cmp    %eax,%esi
   0x000000000040060c <+76>:    vmulsd %xmm0,%xmm0,%xmm0
   0x0000000000400610 <+80>:    vmovsd %xmm0,(%rdx,%rcx,8)
   0x0000000000400615 <+85>:    jne    0x400600 <square+64>
   0x0000000000400617 <+87>:    vzeroupper 
   0x000000000040061a <+90>:    retq

My GCC version is:

gcc (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

I was just wondering if someone could explain what GCC is doing in the first code snippet. There seems to be a bunch of random vmovapd instructions with no particular purpose. For example, the instruction at <+110> seems to be useless, since the contents of %ymm0 are already the same as -0x90(%rbp). I can understand what the -O3 code does, but I'm confused about the non-optimized code. Thank you.

At -O0, every variable is on the stack. This includes v, the 2 parameters of _mm256_mul_pd, etc. The vmovapd are just copying between those variables. — Marc Glisse, May 07 '20 at 23:13
Yeah there's a bunch of stupid that's required at O0 in order for debuggers to be able to show the contents of variables. — Zan Lynx, May 07 '20 at 23:16
Ohhhhhhhhhh wow I can't believe I missed that. Yeah that makes a lot more sense. Thanks guys! — Cristian Bicheru, May 07 '20 at 23:24
To get reasonable performance and a debuggable assembly, I suggest trying `-Og` instead of `-O0`: https://godbolt.org/z/BELHCo — chtz, May 07 '20 at 23:25
@chtz debuggable C not assembly. Assembly is always "debuggable" — 0___________, May 08 '20 at 00:22
Duplicate of [Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?](https://stackoverflow.com/q/53366394) which goes into more detail of @MarcGlisse's point that literally every variable is treated kind of like `volatile` at `-O0` — Peter Cordes, May 08 '20 at 02:30
@P__J__ right (I knew my wording was not quite accurate ...) — chtz, May 08 '20 at 07:18

GCC -O0 Generating Weird AVX extra store/reload instructions with intrinsics

0 Answers