I have created a simple, vectorized C function to square each element in an array. The code is as follows:
#include <immintrin.h>
#include <stdlib.h>
/*
 * Write the square of each element of arr into outarr.
 *
 * arr    - input array of len doubles; must be 32-byte aligned because it is
 *          read with the aligned load _mm256_load_pd.
 * len    - number of elements to process; any value, including 0, is valid.
 *          (unsigned int is the same type as the glibc typedef `uint`, but is
 *          available in strict ISO C mode as well.)
 * outarr - output array of len doubles; must be 32-byte aligned because it is
 *          written with the non-temporal store _mm256_stream_pd.
 *
 * Non-temporal stores are weakly ordered: if another thread consumes outarr,
 * the caller should issue a store fence (_mm_sfence) before publishing it.
 */
void square(const double* arr, unsigned int len, double* outarr) {
    /*
     * Number of elements covered by whole 4-wide vectors. Deriving the bound
     * this way (instead of `i <= len - 4`) avoids the unsigned wrap-around
     * that made the vector loop run out of bounds whenever len < 4.
     */
    const unsigned int vec_end = len - (len & 3u);

    for (unsigned int i = 0; i < vec_end; i += 4) {
        const __m256d v = _mm256_load_pd(&arr[i]);
        _mm256_stream_pd(&outarr[i], _mm256_mul_pd(v, v));
    }

    /* Scalar tail: the remaining 0..3 elements. */
    for (unsigned int i = vec_end; i < len; i++) {
        outarr[i] = arr[i] * arr[i];
    }
}
/*
 * Driver: allocates two 32-byte-aligned buffers of 100 doubles, fills the
 * input with known values and squares it with square().
 * Returns 0 on success, 1 if either allocation fails.
 */
int main(void) {
    enum { COUNT = 100 };

    /*
     * C11 aligned_alloc requires the size to be a multiple of the alignment:
     * 100 * sizeof(double) = 800 = 25 * 32.
     */
    double *inp = aligned_alloc(32, COUNT * sizeof *inp);
    double *out = aligned_alloc(32, COUNT * sizeof *out);
    if (inp == NULL || out == NULL) {
        free(inp); /* free(NULL) is a no-op, so no extra guards are needed */
        free(out);
        return 1;
    }

    /* Give the input defined contents; aligned_alloc returns uninitialized memory. */
    for (int i = 0; i < COUNT; i++) {
        inp[i] = (double)i;
    }

    square(inp, COUNT, out);

    free(inp);
    free(out);
    return 0;
}
When I compile this code with:
gcc main.c -mavx -o main
I get the following disassembly for the square function:
0x0000000000400546 <+0>: lea 0x8(%rsp),%r10
0x000000000040054b <+5>: and $0xffffffffffffffe0,%rsp
0x000000000040054f <+9>: pushq -0x8(%r10)
0x0000000000400553 <+13>: push %rbp
0x0000000000400554 <+14>: mov %rsp,%rbp
0x0000000000400557 <+17>: push %r10
0x0000000000400559 <+19>: sub $0x50,%rsp
0x000000000040055d <+23>: mov %rdi,-0xb8(%rbp)
0x0000000000400564 <+30>: mov %esi,-0xbc(%rbp)
0x000000000040056a <+36>: mov %rdx,-0xc8(%rbp)
0x0000000000400571 <+43>: movl $0x0,-0xa8(%rbp)
0x000000000040057b <+53>: jmpq 0x400615 <square+207>
0x0000000000400580 <+58>: mov -0xa8(%rbp),%eax
0x0000000000400586 <+64>: cltq
0x0000000000400588 <+66>: lea 0x0(,%rax,8),%rdx
0x0000000000400590 <+74>: mov -0xb8(%rbp),%rax
0x0000000000400597 <+81>: add %rdx,%rax
0x000000000040059a <+84>: mov %rax,-0xa0(%rbp)
0x00000000004005a1 <+91>: mov -0xa0(%rbp),%rax
0x00000000004005a8 <+98>: vmovapd (%rax),%ymm0
0x00000000004005ac <+102>: vmovapd %ymm0,-0x90(%rbp)
0x00000000004005b4 <+110>: vmovapd -0x90(%rbp),%ymm0
0x00000000004005bc <+118>: vmovapd %ymm0,-0x70(%rbp)
0x00000000004005c1 <+123>: vmovapd -0x90(%rbp),%ymm0
0x00000000004005c9 <+131>: vmovapd %ymm0,-0x30(%rbp)
0x00000000004005ce <+136>: vmovapd -0x70(%rbp),%ymm0
0x00000000004005d3 <+141>: vmulpd -0x30(%rbp),%ymm0,%ymm0
0x00000000004005d8 <+146>: mov -0xa8(%rbp),%eax
0x00000000004005de <+152>: cltq
0x00000000004005e0 <+154>: lea 0x0(,%rax,8),%rdx
0x00000000004005e8 <+162>: mov -0xc8(%rbp),%rax
0x00000000004005ef <+169>: add %rdx,%rax
0x00000000004005f2 <+172>: mov %rax,-0x98(%rbp)
0x00000000004005f9 <+179>: vmovapd %ymm0,-0x50(%rbp)
0x00000000004005fe <+184>: mov -0x98(%rbp),%rax
0x0000000000400605 <+191>: vmovapd -0x50(%rbp),%ymm0
0x000000000040060a <+196>: vmovntpd %ymm0,(%rax)
0x000000000040060e <+200>: addl $0x4,-0xa8(%rbp)
0x0000000000400615 <+207>: mov -0xbc(%rbp),%eax
0x000000000040061b <+213>: lea -0x4(%rax),%edx
0x000000000040061e <+216>: mov -0xa8(%rbp),%eax
0x0000000000400624 <+222>: cmp %eax,%edx
0x0000000000400626 <+224>: jae 0x400580 <square+58>
0x000000000040062c <+230>: mov -0xbc(%rbp),%eax
0x0000000000400632 <+236>: and $0xfffffffc,%eax
0x0000000000400635 <+239>: mov %eax,-0xa4(%rbp)
0x000000000040063b <+245>: jmp 0x40069c <square+342>
0x000000000040063d <+247>: mov -0xa4(%rbp),%eax
0x0000000000400643 <+253>: lea 0x0(,%rax,8),%rdx
0x000000000040064b <+261>: mov -0xc8(%rbp),%rax
0x0000000000400652 <+268>: add %rdx,%rax
Then, when I compile the code with:
gcc main.c -mavx -O3 -o main
I get the following disassembly for the square function:
0x00000000004005c0 <+0>: lea -0x4(%rsi),%r9d
0x00000000004005c4 <+4>: mov %rdi,%r8
0x00000000004005c7 <+7>: mov %rdx,%rcx
0x00000000004005ca <+10>: xor %eax,%eax
0x00000000004005cc <+12>: nopl 0x0(%rax)
0x00000000004005d0 <+16>: vmovapd (%r8),%ymm0
0x00000000004005d5 <+21>: add $0x4,%eax
0x00000000004005d8 <+24>: add $0x20,%r8
0x00000000004005dc <+28>: add $0x20,%rcx
0x00000000004005e0 <+32>: vmulpd %ymm0,%ymm0,%ymm0
0x00000000004005e4 <+36>: vmovntpd %ymm0,-0x20(%rcx)
0x00000000004005e9 <+41>: cmp %eax,%r9d
0x00000000004005ec <+44>: jae 0x4005d0 <square+16>
0x00000000004005ee <+46>: mov %esi,%eax
0x00000000004005f0 <+48>: and $0xfffffffc,%eax
0x00000000004005f3 <+51>: cmp %eax,%esi
0x00000000004005f5 <+53>: jbe 0x400617 <square+87>
0x00000000004005f7 <+55>: nopw 0x0(%rax,%rax,1)
0x0000000000400600 <+64>: mov %eax,%ecx
0x0000000000400602 <+66>: add $0x1,%eax
0x0000000000400605 <+69>: vmovsd (%rdi,%rcx,8),%xmm0
0x000000000040060a <+74>: cmp %eax,%esi
0x000000000040060c <+76>: vmulsd %xmm0,%xmm0,%xmm0
0x0000000000400610 <+80>: vmovsd %xmm0,(%rdx,%rcx,8)
0x0000000000400615 <+85>: jne 0x400600 <square+64>
0x0000000000400617 <+87>: vzeroupper
0x000000000040061a <+90>: retq
My GCC version is:
gcc (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
I was just wondering if someone could explain what GCC is doing in the first (non-optimized) listing. There seem to be a bunch of seemingly random `vmovapd` instructions with no particular purpose. For example, instruction <+110> seems to be useless, since the contents of `%ymm0` are already the same as `-0x90(%rbp)` after the store at <+102>. I can understand what the `-O3` code does, but I'm confused about the non-optimized code. Thank you.