I wrote this snippet during a recent argument over the supposed speed of `array[i++]` vs `array[i]; i++`.
/* Test case for the compiler listings shown below: a loop that stores 0
 * into every element of a 10-element file-scope int array. */
int array[10];
int main(){
int i=0;
/* Visits indices 0..9; the store and the increment are deliberately
 * written as two statements (the `array[i]; i++` form being compared
 * against `array[i++]`). */
while(i < 10){
array[i] = 0;
i++;
}
/* The returned 0 is presumably what the `xor eax, eax` in the listings
 * below sets up (return value in eax). */
return 0;
}
Snippet at the compiler explorer: https://godbolt.org/g/de7TY2
As expected, the compiler output identical asm for `array[i++]` and `array[i]; i++` with at least `-O1`. However, what surprised me was the placement of the `xor eax, eax`, which appears seemingly at random within the function at higher optimization levels.
GCC
At `-O2`, GCC places the `xor` directly before the `ret`, as expected:
mov DWORD PTR [rax], 0
add rax, 4
cmp rax, OFFSET FLAT:array+40
jne .L2
xor eax, eax
ret
However, at `-O3` it places the `xor` after the second `mov`:
mov QWORD PTR array[rip], 0
mov QWORD PTR array[rip+8], 0
xor eax, eax
mov QWORD PTR array[rip+16], 0
mov QWORD PTR array[rip+24], 0
mov QWORD PTR array[rip+32], 0
ret
icc
icc places it normally at `-O1`:
push rsi
xor esi, esi
push 3
pop rdi
call __intel_new_feature_proc_init
stmxcsr DWORD PTR [rsp]
xor eax, eax
or DWORD PTR [rsp], 32832
ldmxcsr DWORD PTR [rsp]
..B1.2:
mov DWORD PTR [array+rax*4], 0
inc rax
cmp rax, 10
jl ..B1.2
xor eax, eax
pop rcx
ret
but in a strange place at `-O2`:
push rbp
mov rbp, rsp
and rsp, -128
sub rsp, 128
xor esi, esi
mov edi, 3
call __intel_new_feature_proc_init
stmxcsr DWORD PTR [rsp]
pxor xmm0, xmm0
xor eax, eax
or DWORD PTR [rsp], 32832
ldmxcsr DWORD PTR [rsp]
movdqu XMMWORD PTR array[rip], xmm0
movdqu XMMWORD PTR 16+array[rip], xmm0
mov DWORD PTR 32+array[rip], eax
mov DWORD PTR 36+array[rip], eax
mov rsp, rbp
pop rbp
ret
and at `-O3`:
and rsp, -128
sub rsp, 128
mov edi, 3
call __intel_new_proc_init
stmxcsr DWORD PTR [rsp]
xor eax, eax
or DWORD PTR [rsp], 32832
ldmxcsr DWORD PTR [rsp]
mov rsp, rbp
pop rbp
ret
Clang
Only clang places the `xor` directly in front of the `ret` at all optimization levels:
xorps xmm0, xmm0
movaps xmmword ptr [rip + array+16], xmm0
movaps xmmword ptr [rip + array], xmm0
mov qword ptr [rip + array+32], 0
xor eax, eax
ret
Since GCC and ICC both do this at higher optimization levels, I presume there must be some kind of good reason.
Why do some compilers do this?
The code is semantically identical, of course, and the compiler can reorder it as it wishes, but since the placement only changes at higher optimization levels, it must be caused by some kind of optimization.