I've been writing some basic functions using GCC's `asm` to practice for an actual application.
My functions `pretty`, `wrap`, and `pure` generate the same instructions to unpack a 64-bit integer into a 128-bit vector. `add1` and `add2`, which call `pretty` and `wrap` respectively, also generate the same instructions. But `add3` differs: it saves its `xmm0` register by storing it on the stack rather than by copying it to another `xmm` register. I don't understand this, because the compiler can see the details of `pure` and should therefore know that none of the other `xmm` registers will be clobbered.
Here is the C++ source:
#include <immintrin.h>
// Build a 128-bit vector whose two 64-bit lanes both hold `b`, written with
// plain vector initialisation so the compiler picks the instructions itself.
__m128i pretty(long long b) {
    const __m128i both_lanes = { b, b };
    return both_lanes;
}
/*
 * Duplicate the 64-bit integer `b` into both 64-bit lanes of a 128-bit
 * vector using one `vmovddup` issued through extended inline asm.
 *
 * b: the value to duplicate.
 * Returns: a vector whose two 64-bit lanes both equal `b`.
 *
 * `vmovddup` is an AVX instruction, so the target CPU must support AVX.
 */
__m128i wrap(long long b) {
    __m128i result;
    // "=x" declares the destination as an output in an SSE register of the
    // compiler's choosing, so the value is actually returned and no register
    // is modified behind the compiler's back.  "m" asks the compiler to place
    // `b` in memory, so the template no longer hard-codes rdi or writes to a
    // hand-picked address below rsp.  The {att|intel} alternatives keep the
    // template valid whether or not -masm=intel is used.
    __asm__ ("vmovddup {%1, %0|%0, %1}"
             : "=x"(result)
             : "m"(b));
    return result;
}
// Prototype for `pure`. extern "C" keeps the symbol name unmangled so it
// matches the `pure:` label emitted by the asm statement below. This
// prototype is the only C++-level description of the function in this file.
extern "C" __m128i pure(long long b);
// File-scope asm that emits the entire body of `pure` as raw assembler text:
// it switches to .text, exports the symbol and marks it as a function.
// The operands are written in Intel syntax; the build command given with this
// code passes -masm=intel.
asm (".text\n.global pure\n\t.type pure, @function\n"
"pure:\n\t"
// Store the incoming rdi to a scratch slot 16 bytes below rsp ...
"mov qword ptr [rsp-0x10], rdi\n\t"
// ... then load that qword back, duplicated into both halves of xmm0.
"vmovddup xmm0, qword ptr [rsp-0x10]\n\t"
"ret\n\t"
);
// Add `in2`, duplicated into both 64-bit lanes by pretty(), to the vector `in`.
__m128i add1(__m128i in, long long in2) {
    const __m128i splat = pretty(in2);
    return in + splat;
}
// Add the vector returned by wrap(in2) to the vector `in`, lane by lane.
__m128i add2(__m128i in, long long in2) {
    const __m128i addend = wrap(in2);
    return in + addend;
}
// Add the vector returned by the assembly routine pure(in2) to the vector
// `in`, lane by lane.
__m128i add3(__m128i in, long long in2) {
    const __m128i from_asm = pure(in2);
    return in + from_asm;
}
Compiled with `g++ -c so.cpp -march=native -masm=intel -O3 -fno-inline` and disassembled with `objdump -d -M intel so.o | c++filt`.
so.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <pure>:
0: 48 89 7c 24 f0 mov QWORD PTR [rsp-0x10],rdi
5: c5 fb 12 44 24 f0 vmovddup xmm0,QWORD PTR [rsp-0x10]
b: c3 ret
c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
0000000000000010 <pretty(long long)>:
10: 48 89 7c 24 f0 mov QWORD PTR [rsp-0x10],rdi
15: c5 fb 12 44 24 f0 vmovddup xmm0,QWORD PTR [rsp-0x10]
1b: c3 ret
1c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
0000000000000020 <wrap(long long)>:
20: 48 89 7c 24 f0 mov QWORD PTR [rsp-0x10],rdi
25: c5 fb 12 44 24 f0 vmovddup xmm0,QWORD PTR [rsp-0x10]
2b: c3 ret
2c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
0000000000000030 <add1(long long __vector(2), long long)>:
30: c5 f8 28 c8 vmovaps xmm1,xmm0
34: 48 83 ec 08 sub rsp,0x8
38: e8 00 00 00 00 call 3d <add1(long long __vector(2), long long)+0xd>
3d: 48 83 c4 08 add rsp,0x8
41: c5 f9 d4 c1 vpaddq xmm0,xmm0,xmm1
45: c3 ret
46: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
4d: 00 00 00
0000000000000050 <add2(long long __vector(2), long long)>:
50: c5 f8 28 c8 vmovaps xmm1,xmm0
54: 48 83 ec 08 sub rsp,0x8
58: e8 00 00 00 00 call 5d <add2(long long __vector(2), long long)+0xd>
5d: 48 83 c4 08 add rsp,0x8
61: c5 f9 d4 c1 vpaddq xmm0,xmm0,xmm1
65: c3 ret
66: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
6d: 00 00 00
0000000000000070 <add3(long long __vector(2), long long)>:
70: 48 83 ec 18 sub rsp,0x18
74: c5 f8 29 04 24 vmovaps XMMWORD PTR [rsp],xmm0
79: e8 00 00 00 00 call 7e <add3(long long __vector(2), long long)+0xe>
7e: c5 f9 d4 04 24 vpaddq xmm0,xmm0,XMMWORD PTR [rsp]
83: 48 83 c4 18 add rsp,0x18
87: c3 ret