Here is the code :
unsigned int factorial(unsigned int n) {
if (n == 0) return 1;
return n * factorial(n - 1);
}
int main() {
return factorial(0);
}
I generate both gcc and clang assemblies using
g++ -g -O2 -c factorial.cpp (resp clang++)
objdump -d -M intel -S factorial.o
Here is what I get for gcc
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
if (n == 0)
0: 85 ff test edi,edi
2: b8 01 00 00 00 mov eax,0x1
7: 74 11 je 1a <_Z9factorialj+0x1a>
9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
10: 0f af c7 imul eax,edi
13: 83 ef 01 sub edi,0x1
16: 75 f8 jne 10 <_Z9factorialj+0x10>
18: f3 c3 repz ret
{
return 1;
}
return n * factorial(n - 1);
}
1a: f3 c3 repz ret
Disassembly of section .text.startup:
0000000000000000 <main>:
if (n == 0)
0: b8 01 00 00 00 mov eax,0x1
5: c3 ret
and for clang
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
0: b8 01 00 00 00 mov eax,0x1
if (n == 0)
5: 85 ff test edi,edi
7: 0f 84 ba 01 00 00 je 1c7 <_Z9factorialj+0x1c7>
d: b8 01 00 00 00 mov eax,0x1
{
return 1;
}
return n * factorial(n - 1);
12: 83 ff 08 cmp edi,0x8
15: 0f 82 a5 01 00 00 jb 1c0 <_Z9factorialj+0x1c0>
1b: 41 89 f8 mov r8d,edi
1e: 41 83 e0 f8 and r8d,0xfffffff8
22: 89 fa mov edx,edi
24: 83 e2 f8 and edx,0xfffffff8
27: 0f 84 93 01 00 00 je 1c0 <_Z9factorialj+0x1c0>
2d: 8d 4f f8 lea ecx,[rdi-0x8]
30: 89 c8 mov eax,ecx
32: c1 e8 03 shr eax,0x3
35: 0f ba e1 03 bt ecx,0x3
39: 72 24 jb 5f <_Z9factorialj+0x5f>
3b: 66 0f 6e c7 movd xmm0,edi
3f: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
44: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4c <_Z9factorialj+0x4c>
4b: 00
4c: 66 0f fe c1 paddd xmm0,xmm1
50: 66 0f fe 0d 00 00 00 paddd xmm1,XMMWORD PTR [rip+0x0] # 58 <_Z9factorialj+0x58>
57: 00
58: b9 08 00 00 00 mov ecx,0x8
5d: eb 0e jmp 6d <_Z9factorialj+0x6d>
5f: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 67 <_Z9factorialj+0x67>
66: 00
67: 31 c9 xor ecx,ecx
69: 66 0f 6f c8 movdqa xmm1,xmm0
6d: 85 c0 test eax,eax
6f: 0f 84 d4 00 00 00 je 149 <_Z9factorialj+0x149>
75: 89 d0 mov eax,edx
77: 29 c8 sub eax,ecx
79: 89 fe mov esi,edi
7b: 29 ce sub esi,ecx
7d: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] # 85 <_Z9factorialj+0x85>
84: 00
85: 66 0f 6f 1d 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0] # 8d <_Z9factorialj+0x8d>
8c: 00
8d: 0f 1f 00 nop DWORD PTR [rax]
90: 66 0f 6e e6 movd xmm4,esi
94: 66 0f 70 e4 00 pshufd xmm4,xmm4,0x0
99: 66 0f 6f ec movdqa xmm5,xmm4
9d: 66 0f fe ea paddd xmm5,xmm2
a1: 66 0f fe e3 paddd xmm4,xmm3
a5: 66 0f 70 f5 f5 pshufd xmm6,xmm5,0xf5
aa: 66 0f f4 e8 pmuludq xmm5,xmm0
ae: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
b3: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
b8: 66 0f f4 c6 pmuludq xmm0,xmm6
bc: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
c1: 66 0f 62 e8 punpckldq xmm5,xmm0
c5: 66 0f 70 c4 f5 pshufd xmm0,xmm4,0xf5
ca: 66 0f f4 e1 pmuludq xmm4,xmm1
ce: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
d3: 66 0f 70 c9 f5 pshufd xmm1,xmm1,0xf5
d8: 66 0f f4 c8 pmuludq xmm1,xmm0
dc: 66 0f 70 c1 e8 pshufd xmm0,xmm1,0xe8
e1: 66 0f 62 e0 punpckldq xmm4,xmm0
e5: 8d 4e f8 lea ecx,[rsi-0x8]
e8: 66 0f 6e c1 movd xmm0,ecx
ec: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
f1: 66 0f 6f c1 movdqa xmm0,xmm1
f5: 66 0f fe c2 paddd xmm0,xmm2
f9: 66 0f fe cb paddd xmm1,xmm3
fd: 66 0f 70 f0 f5 pshufd xmm6,xmm0,0xf5
102: 66 0f f4 c5 pmuludq xmm0,xmm5
106: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
10b: 66 0f 70 ed f5 pshufd xmm5,xmm5,0xf5
110: 66 0f f4 ee pmuludq xmm5,xmm6
114: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
119: 66 0f 62 c5 punpckldq xmm0,xmm5
11d: 66 0f 70 e9 f5 pshufd xmm5,xmm1,0xf5
122: 66 0f f4 cc pmuludq xmm1,xmm4
126: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
12b: 66 0f 70 e4 f5 pshufd xmm4,xmm4,0xf5
130: 66 0f f4 e5 pmuludq xmm4,xmm5
134: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
139: 66 0f 62 cc punpckldq xmm1,xmm4
13d: 83 c6 f0 add esi,0xfffffff0
140: 83 c0 f0 add eax,0xfffffff0
143: 0f 85 47 ff ff ff jne 90 <_Z9factorialj+0x90>
149: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
14e: 66 0f f4 c8 pmuludq xmm1,xmm0
152: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
157: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
15c: 66 0f f4 c2 pmuludq xmm0,xmm2
160: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
165: 66 0f 62 c8 punpckldq xmm1,xmm0
169: 66 0f 70 c1 4e pshufd xmm0,xmm1,0x4e
16e: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
173: 66 0f f4 c8 pmuludq xmm1,xmm0
177: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
17c: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
181: 66 0f f4 c2 pmuludq xmm0,xmm2
185: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
18a: 66 0f 62 c8 punpckldq xmm1,xmm0
18e: 66 0f 70 c1 e5 pshufd xmm0,xmm1,0xe5
193: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
198: 66 0f f4 c8 pmuludq xmm1,xmm0
19c: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
1a1: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
1a6: 66 0f f4 c2 pmuludq xmm0,xmm2
1aa: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
1af: 66 0f 62 c8 punpckldq xmm1,xmm0
1b3: 66 0f 7e c8 movd eax,xmm1
1b7: 39 fa cmp edx,edi
1b9: 74 0c je 1c7 <_Z9factorialj+0x1c7>
1bb: 44 29 c7 sub edi,r8d
1be: 66 90 xchg ax,ax
1c0: 0f af c7 imul eax,edi
if (n == 0)
1c3: ff cf dec edi
1c5: 75 f9 jne 1c0 <_Z9factorialj+0x1c0>
}
1c7: c3 ret
1c8: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
1cf: 00
00000000000001d0 <main>:
int main()
{
return factorial(0);
1d0: b8 01 00 00 00 mov eax,0x1
1d5: c3 ret
I understand they both notice they can unroll the loop and precompute the value but I don't get why clang generates all this code (and I have no idea what this can be ever doing and why there is sse stuff...)
Bonus question: while gcc precomputes the value up to factorial(7), clang computes it for any value.
Up to factorial(31)
the values look fine, for factorial(32)
and factorial(33)
it returns 0x80000000
and for greater values it does xor eax, eax
instead. What is this black magic ?