Suppose there's is a recursive function foo()
similar to Fibonacci's Sequence:
constexpr inline int foo(const int i)
{
if(i == 0)
return 1;
return foo(i - 1) + foo(i - 2);
}
int main(int argc, const char* argv[])
{
const int ret = foo(argc);
return ret;
}
Then use GCC
on https://gcc.godbolt.org/z/fETb6Px33to compile it under different optimization levels like -Os -std=c++2b -fno-exceptions
, I wonder why the O2/O3
version seems to be so big compares to O1/Os
version, just like the monster Godzilla. Apparently, some critical optimization triggers the inflation of binary size, maybe the O3
tried to boost the executive speed in runtime at the cost of size.
O0
foo(int):
push rbp
mov rbp, rsp
push rbx
sub rsp, 24
mov DWORD PTR [rbp-20], edi
cmp DWORD PTR [rbp-20], 0
jne .L2
mov eax, 1
jmp .L3
.L2:
mov eax, DWORD PTR [rbp-20]
sub eax, 1
mov edi, eax
call foo(int)
mov ebx, eax
mov eax, DWORD PTR [rbp-20]
sub eax, 2
mov edi, eax
call foo(int)
add eax, ebx
.L3:
mov rbx, QWORD PTR [rbp-8]
leave
ret
main:
push rbp
mov rbp, rsp
sub rsp, 32
mov DWORD PTR [rbp-20], edi
mov QWORD PTR [rbp-32], rsi
mov eax, DWORD PTR [rbp-20]
mov edi, eax
call foo(int)
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-4]
leave
ret
O3
foo(int):
test edi, edi
je .L44
push r15
push r14
xor r14d, r14d
push r13
push r12
lea r12d, [rdi-1]
push rbp
mov ebp, r12d
push rbx
sub rsp, 56
.L7:
test ebp, ebp
je .L45
.L3:
lea r13d, [rbp-1]
xor ebx, ebx
mov eax, r14d
mov r15d, ebp
.L10:
lea ebx, [r15-1]
.... // much much more to follow
Os
foo(int):
push rbp
xor ebp, ebp
push rbx
mov ebx, edi
push rcx
.L3:
test ebx, ebx
je .L5
lea edi, [rbx-1]
sub ebx, 2
call foo(int)
add ebp, eax
jmp .L3
.L5:
lea eax, [rbp+1]
pop rdx
pop rbx
pop rbp
ret
main:
jmp foo(int)