0

Here is the code :

unsigned int factorial(unsigned int n) {
    if (n == 0) return 1;
    return n * factorial(n - 1);
}

int main() {
    return factorial(0);
}

I generate both gcc and clang assemblies using

g++ -g -O2 -c factorial.cpp (resp clang++)
objdump -d -M intel -S factorial.o

Here is what I get for gcc

factorial.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
    if (n == 0)
   0:   85 ff                   test   edi,edi
   2:   b8 01 00 00 00          mov    eax,0x1
   7:   74 11                   je     1a <_Z9factorialj+0x1a>
   9:   0f 1f 80 00 00 00 00    nop    DWORD PTR [rax+0x0]
  10:   0f af c7                imul   eax,edi
  13:   83 ef 01                sub    edi,0x1
  16:   75 f8                   jne    10 <_Z9factorialj+0x10>
  18:   f3 c3                   repz ret 
    {
        return 1;
    }

    return n * factorial(n - 1);
}
  1a:   f3 c3                   repz ret 

Disassembly of section .text.startup:

0000000000000000 <main>:
    if (n == 0)
   0:   b8 01 00 00 00          mov    eax,0x1
   5:   c3                      ret    

and for clang

factorial.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
   0:   b8 01 00 00 00          mov    eax,0x1
    if (n == 0)
   5:   85 ff                   test   edi,edi
   7:   0f 84 ba 01 00 00       je     1c7 <_Z9factorialj+0x1c7>
   d:   b8 01 00 00 00          mov    eax,0x1
    {
        return 1;
    }

    return n * factorial(n - 1);
  12:   83 ff 08                cmp    edi,0x8
  15:   0f 82 a5 01 00 00       jb     1c0 <_Z9factorialj+0x1c0>
  1b:   41 89 f8                mov    r8d,edi
  1e:   41 83 e0 f8             and    r8d,0xfffffff8
  22:   89 fa                   mov    edx,edi
  24:   83 e2 f8                and    edx,0xfffffff8
  27:   0f 84 93 01 00 00       je     1c0 <_Z9factorialj+0x1c0>
  2d:   8d 4f f8                lea    ecx,[rdi-0x8]
  30:   89 c8                   mov    eax,ecx
  32:   c1 e8 03                shr    eax,0x3
  35:   0f ba e1 03             bt     ecx,0x3
  39:   72 24                   jb     5f <_Z9factorialj+0x5f>
  3b:   66 0f 6e c7             movd   xmm0,edi
  3f:   66 0f 70 c8 00          pshufd xmm1,xmm0,0x0
  44:   66 0f 6f 05 00 00 00    movdqa xmm0,XMMWORD PTR [rip+0x0]        # 4c <_Z9factorialj+0x4c>
  4b:   00 
  4c:   66 0f fe c1             paddd  xmm0,xmm1
  50:   66 0f fe 0d 00 00 00    paddd  xmm1,XMMWORD PTR [rip+0x0]        # 58 <_Z9factorialj+0x58>
  57:   00 
  58:   b9 08 00 00 00          mov    ecx,0x8
  5d:   eb 0e                   jmp    6d <_Z9factorialj+0x6d>
  5f:   66 0f 6f 05 00 00 00    movdqa xmm0,XMMWORD PTR [rip+0x0]        # 67 <_Z9factorialj+0x67>
  66:   00 
  67:   31 c9                   xor    ecx,ecx
  69:   66 0f 6f c8             movdqa xmm1,xmm0
  6d:   85 c0                   test   eax,eax
  6f:   0f 84 d4 00 00 00       je     149 <_Z9factorialj+0x149>
  75:   89 d0                   mov    eax,edx
  77:   29 c8                   sub    eax,ecx
  79:   89 fe                   mov    esi,edi
  7b:   29 ce                   sub    esi,ecx
  7d:   66 0f 6f 15 00 00 00    movdqa xmm2,XMMWORD PTR [rip+0x0]        # 85 <_Z9factorialj+0x85>
  84:   00 
  85:   66 0f 6f 1d 00 00 00    movdqa xmm3,XMMWORD PTR [rip+0x0]        # 8d <_Z9factorialj+0x8d>
  8c:   00 
  8d:   0f 1f 00                nop    DWORD PTR [rax]
  90:   66 0f 6e e6             movd   xmm4,esi
  94:   66 0f 70 e4 00          pshufd xmm4,xmm4,0x0
  99:   66 0f 6f ec             movdqa xmm5,xmm4
  9d:   66 0f fe ea             paddd  xmm5,xmm2
  a1:   66 0f fe e3             paddd  xmm4,xmm3
  a5:   66 0f 70 f5 f5          pshufd xmm6,xmm5,0xf5
  aa:   66 0f f4 e8             pmuludq xmm5,xmm0
  ae:   66 0f 70 ed e8          pshufd xmm5,xmm5,0xe8
  b3:   66 0f 70 c0 f5          pshufd xmm0,xmm0,0xf5
  b8:   66 0f f4 c6             pmuludq xmm0,xmm6
  bc:   66 0f 70 c0 e8          pshufd xmm0,xmm0,0xe8
  c1:   66 0f 62 e8             punpckldq xmm5,xmm0
  c5:   66 0f 70 c4 f5          pshufd xmm0,xmm4,0xf5
  ca:   66 0f f4 e1             pmuludq xmm4,xmm1
  ce:   66 0f 70 e4 e8          pshufd xmm4,xmm4,0xe8
  d3:   66 0f 70 c9 f5          pshufd xmm1,xmm1,0xf5
  d8:   66 0f f4 c8             pmuludq xmm1,xmm0
  dc:   66 0f 70 c1 e8          pshufd xmm0,xmm1,0xe8
  e1:   66 0f 62 e0             punpckldq xmm4,xmm0
  e5:   8d 4e f8                lea    ecx,[rsi-0x8]
  e8:   66 0f 6e c1             movd   xmm0,ecx
  ec:   66 0f 70 c8 00          pshufd xmm1,xmm0,0x0
  f1:   66 0f 6f c1             movdqa xmm0,xmm1
  f5:   66 0f fe c2             paddd  xmm0,xmm2
  f9:   66 0f fe cb             paddd  xmm1,xmm3
  fd:   66 0f 70 f0 f5          pshufd xmm6,xmm0,0xf5
 102:   66 0f f4 c5             pmuludq xmm0,xmm5
 106:   66 0f 70 c0 e8          pshufd xmm0,xmm0,0xe8
 10b:   66 0f 70 ed f5          pshufd xmm5,xmm5,0xf5
 110:   66 0f f4 ee             pmuludq xmm5,xmm6
 114:   66 0f 70 ed e8          pshufd xmm5,xmm5,0xe8
 119:   66 0f 62 c5             punpckldq xmm0,xmm5
 11d:   66 0f 70 e9 f5          pshufd xmm5,xmm1,0xf5
 122:   66 0f f4 cc             pmuludq xmm1,xmm4
 126:   66 0f 70 c9 e8          pshufd xmm1,xmm1,0xe8
 12b:   66 0f 70 e4 f5          pshufd xmm4,xmm4,0xf5
 130:   66 0f f4 e5             pmuludq xmm4,xmm5
 134:   66 0f 70 e4 e8          pshufd xmm4,xmm4,0xe8
 139:   66 0f 62 cc             punpckldq xmm1,xmm4
 13d:   83 c6 f0                add    esi,0xfffffff0
 140:   83 c0 f0                add    eax,0xfffffff0
 143:   0f 85 47 ff ff ff       jne    90 <_Z9factorialj+0x90>
 149:   66 0f 70 d1 f5          pshufd xmm2,xmm1,0xf5
 14e:   66 0f f4 c8             pmuludq xmm1,xmm0
 152:   66 0f 70 c9 e8          pshufd xmm1,xmm1,0xe8
 157:   66 0f 70 c0 f5          pshufd xmm0,xmm0,0xf5
 15c:   66 0f f4 c2             pmuludq xmm0,xmm2
 160:   66 0f 70 c0 e8          pshufd xmm0,xmm0,0xe8
 165:   66 0f 62 c8             punpckldq xmm1,xmm0
 169:   66 0f 70 c1 4e          pshufd xmm0,xmm1,0x4e
 16e:   66 0f 70 d1 f5          pshufd xmm2,xmm1,0xf5
 173:   66 0f f4 c8             pmuludq xmm1,xmm0
 177:   66 0f 70 c9 e8          pshufd xmm1,xmm1,0xe8
 17c:   66 0f 70 c0 f5          pshufd xmm0,xmm0,0xf5
 181:   66 0f f4 c2             pmuludq xmm0,xmm2
 185:   66 0f 70 c0 e8          pshufd xmm0,xmm0,0xe8
 18a:   66 0f 62 c8             punpckldq xmm1,xmm0
 18e:   66 0f 70 c1 e5          pshufd xmm0,xmm1,0xe5
 193:   66 0f 70 d1 f5          pshufd xmm2,xmm1,0xf5
 198:   66 0f f4 c8             pmuludq xmm1,xmm0
 19c:   66 0f 70 c9 e8          pshufd xmm1,xmm1,0xe8
 1a1:   66 0f 70 c0 f5          pshufd xmm0,xmm0,0xf5
 1a6:   66 0f f4 c2             pmuludq xmm0,xmm2
 1aa:   66 0f 70 c0 e8          pshufd xmm0,xmm0,0xe8
 1af:   66 0f 62 c8             punpckldq xmm1,xmm0
 1b3:   66 0f 7e c8             movd   eax,xmm1
 1b7:   39 fa                   cmp    edx,edi
 1b9:   74 0c                   je     1c7 <_Z9factorialj+0x1c7>
 1bb:   44 29 c7                sub    edi,r8d
 1be:   66 90                   xchg   ax,ax
 1c0:   0f af c7                imul   eax,edi
    if (n == 0)
 1c3:   ff cf                   dec    edi
 1c5:   75 f9                   jne    1c0 <_Z9factorialj+0x1c0>
}
 1c7:   c3                      ret    
 1c8:   0f 1f 84 00 00 00 00    nop    DWORD PTR [rax+rax*1+0x0]
 1cf:   00 

00000000000001d0 <main>:

int main()
{
    return factorial(0);
 1d0:   b8 01 00 00 00          mov    eax,0x1
 1d5:   c3                      ret    

I understand they both notice they can unroll the loop and precompute the value but I don't get why clang generates all this code (and I have no idea what this can be ever doing and why there is sse stuff...)

Bonus question: while gcc precomputes the value up to factorial(7), clang computes it for any value. Up to factorial(31) the values look fine, for factorial(32) and factorial(33) it returns 0x80000000 and for greater values it does xor eax, eax instead. What is this black magic ?

cmourglia
  • 2,423
  • 1
  • 17
  • 33
  • what is the non-bonus question? – 463035818_is_not_an_ai Oct 27 '16 at 16:24
  • Why does clang generates all this code ? (Maybe what does it do too, but I guess the answer is nothing since it's not called) – cmourglia Oct 27 '16 at 16:26
  • For the bonus question: n! expands very fast. 32! has 31 2's in it's prime expansion. - so if you truncate 32! to 32 bits (which is typical for unsigned int), you get 0x80000000. Multiply that by 33, and nothing changes. Multiply by 34 and the last bit is shifted off the top. – Martin Bonner supports Monica Oct 27 '16 at 16:37
  • Oh ok the `xor eax, eax` is just for setting `eax` to 0, did not know this trick. – cmourglia Oct 27 '16 at 16:41
  • Links explaining [why the xor-zeroing idiom is good](http://stackoverflow.com/a/33668295/224132) and more in the [x86 tag wiki](http://stackoverflow.com/tags/x86/info) – Peter Cordes Oct 28 '16 at 02:24
  • clang enables auto-vectorization at `-O2`, but gcc only does that at `-O3`. What the heck are you talking about with gcc "pre-computing the value", or unrolling? It does neither of those things. It just loop `fact=1; for(i=n ; n>0 ; --n) fact *= n;` Or are you talking about when it inlines into `main` with different constant args? gcc auto-vectorizes factorial at -O3, too, and succeeds at evaluating at compile time for args > 7. ([Godbolt](https://godbolt.org/g/l15S50)). – Peter Cordes Oct 28 '16 at 02:31
  • *What does the SSE code do? nothing since it's not called?*: It correctly implements the factorial function, for any possible arg, of course. Since you made the function global (not `static` to this compilation unit), the compiler has to emit a definition that follows the calling convention and works for all possible args. I'm not sure how it works either; but I see a lot of shuffling. The compiler has an easier time with SSE4.1 (or AVX2) (since it can use PMULLD). – Peter Cordes Oct 28 '16 at 02:34

0 Answers0