Godbolt Link: https://godbolt.org/g/Hv6MAL
// Element type of the table that is walked in main().
typedef int cell;
// Global of type cell; main() stores every value it reads into it.
cell y;
// Pointer-to-const view of cells starting at the fixed address 0x12340.
// Only the pointee is const-qualified; the pointer variable itself is a
// mutable global.
const cell *phys_addr = (const cell*)0x12340;
// Walks a three-level table rooted at phys_addr (20 x 30 x 50 iterations) and
// stores each visited cell into the global y. Both pointer computations are
// written inside the innermost loop although neither uses k.
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
// Uses only i: the value read from phys_addr[i], divided by sizeof(cell), is
// added as an element offset to &phys_addr[i].
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
// Uses only i and j: same scheme one level down, relative to &subarray[j].
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
// Store to the global y on every innermost iteration.
// NOTE(review): y has the same type as the cells being read and is a global,
// so the compiler presumably cannot rule out that this store changes
// phys_addr[i] or subarray[j]; that would force both loads to be repeated
// for every k. Confirm against GCC's alias analysis.
y = subsubarray[k];
}
}
}
}
It feels natural to expect the compiler to optimize the above code to something similar to:
// Hand-hoisted form of the same loop nest: subarray is computed once per i and
// subsubarray once per (i, j), so the innermost loop only loads and stores.
int main() {
for (int i = 0; i < 20; i++) {
// Invariant across the j and k loops (uses only i).
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
for (int j = 0; j < 30; j++) {
// Invariant across the k loop (uses only i and j).
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
for (int k = 0; k < 50; k++) {
// Store to the global y; the pointers above are not recomputed after it.
y = subsubarray[k];
}
}
}
}
but the assembly generated by gcc 8.2 with -O3 -m32
as flags is:
# Compiler listing for the loop nest that stores into the global y
# (Intel syntax, 32-bit). Roles as used below:
#   ebp     = &phys_addr[i]   (advanced by 4 per outer iteration)
#   [esp]   = i
#   [esp+4] = phys_addr       (base pointer, loaded once)
#   esi     = j
#   edi     = j * 4           (byte offset of subarray[j])
#   eax     = k
#   ebx     = subarray        (recomputed on every innermost iteration)
# Prologue: save ebp/edi/esi/ebx and reserve 8 bytes for the two stack slots.
push ebp
push edi
push esi
push ebx
sub esp, 8
# Load the phys_addr pointer once; i = 0.
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0
mov DWORD PTR [esp+4], eax
mov ebp, eax
# Outer loop (i): reset j = 0.
.L4:
xor esi, esi
# Middle loop (j): edi = j * 4, k = 0.
.L3:
lea edi, [0+esi*4]
xor eax, eax
# Inner loop (k): both pointer computations are redone here on every pass.
.L2:
# edx = phys_addr[i], reloaded from memory each iteration.
mov edx, DWORD PTR [ebp+0]
mov ecx, DWORD PTR [esp+4]
# Unsigned divide by 4 (sizeof(cell)), then add i.
shr edx, 2
add edx, DWORD PTR [esp]
# ebx = phys_addr + 4 * (i + phys_addr[i] / 4), i.e. subarray.
lea ebx, [ecx+edx*4]
# edx = k + j (using k before it is incremented), then k++.
lea edx, [eax+esi]
add eax, 1
# ecx = subarray[j], reloaded from memory each iteration, then divided by 4.
mov ecx, DWORD PTR [ebx+edi]
shr ecx, 2
# edx = j + k + subarray[j] / 4, so [ebx+edx*4] is subsubarray[k].
add edx, ecx
mov edx, DWORD PTR [ebx+edx*4]
# Store the loaded cell into the global y.
mov DWORD PTR y, edx
# Repeat until k == 50.
cmp eax, 50
jne .L2
# j++; repeat until j == 30.
add esi, 1
cmp esi, 30
jne .L3
# i++ (kept in memory at [esp]); advance ebp to &phys_addr[i]; repeat until i == 20.
add DWORD PTR [esp], 1
mov eax, DWORD PTR [esp]
add ebp, 4
cmp eax, 20
jne .L4
# Epilogue: release the stack slots, return 0, restore the saved registers.
add esp, 8
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
Why isn't the compiler moving the subarray
and subsubarray
calculations out of the inner loops, given that neither depends on k?
random volatile
does magic
I randomly added volatile
to prevent dead-code elimination (DCE) from getting rid of all the code, and then somehow the loop invariants got hoisted out of the inner loops.
// Same loop nest, but each loaded value is stored into a volatile local
// declared inside the innermost loop instead of into a global.
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
// Uses only i.
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
// Uses only i and j.
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
// volatile keeps the store from being removed as dead code. This y is a
// local whose address is never taken.
// NOTE(review): presumably that lets the compiler prove the store cannot
// modify the cells read through phys_addr; confirm against GCC's alias
// analysis.
volatile cell y = subsubarray[k];
}
}
}
return 0;
}
This does not seem to be solely because y
became a local variable, since replacing the store with std::cout << subsubarray[k];
also prevented the optimization.
The assembly generated by gcc 8.2 with -O3 -m32
as flags for the aforementioned code is:
# Compiler listing for the variant that stores into a volatile local
# (Intel syntax, 32-bit). Roles as used below:
#   ebp      = phys_addr      (base pointer, loaded once)
#   edi      = i
#   ecx      = j              (spilled to [esp] while the inner loop runs)
#   ebx      = subarray       (computed once per i)
#   esi      = subarray + 200 bytes
#   eax      = cursor into subsubarray (computed once per j)
#   edx      = end pointer for the inner loop
#   [esp+16] = the volatile local y
main:
# Prologue: save ebp/edi/esi/ebx, i = 0, reserve 20 bytes of stack.
push ebp
push edi
xor edi, edi
push esi
push ebx
sub esp, 20
mov ebp, DWORD PTR phys_addr
# Outer loop (i): compute subarray once.
.L4:
# eax = phys_addr[i]; j = 0.
mov eax, DWORD PTR [ebp+0+edi*4]
xor ecx, ecx
# Unsigned divide by 4 (sizeof(cell)), then add i.
shr eax, 2
add eax, edi
# ebx = phys_addr + 4 * (i + phys_addr[i] / 4), i.e. subarray.
lea ebx, [ebp+0+eax*4]
# esi = subarray + 200 bytes (50 cells of 4 bytes), used to form the end pointer.
lea esi, [ebx+200]
# Middle loop (j): compute subsubarray once.
.L3:
# edx = subarray[j]; spill j because ecx is reused in the inner loop.
mov edx, DWORD PTR [ebx+ecx*4]
mov DWORD PTR [esp], ecx
# edx = 4 * (j + subarray[j] / 4): byte offset of subsubarray from subarray.
shr edx, 2
add edx, ecx
sal edx, 2
# eax = subsubarray; edx = subsubarray + 200 bytes (one past the 50th cell).
lea eax, [ebx+edx]
add edx, esi
# Inner loop (k): load one cell, advance the cursor, store into the volatile local.
.L2:
mov ecx, DWORD PTR [eax]
add eax, 4
mov DWORD PTR [esp+16], ecx
# Repeat until the cursor reaches the end pointer.
cmp edx, eax
jne .L2
# Reload j, j++; repeat until j == 30.
mov ecx, DWORD PTR [esp]
add ecx, 1
cmp ecx, 30
jne .L3
# i++; repeat until i == 20.
add edi, 1
cmp edi, 20
jne .L4
# Epilogue: release the stack, return 0, restore the saved registers.
add esp, 20
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
The loop invariants are hoisted out of the inner loops. What did the random volatile
do to allow GCC to hoist the invariants? The optimization does not happen with clang 6.0.0.