Will there be a performance difference between the following two loops?
# Loop 1: the end pointer is held in a register (r14) for the whole loop.
.LOOP1:
mov edi, dword ptr [rbx] # load the 4-byte element at rbx as the argument to f
call f(int)@PLT
add rbx, 4 # step the element pointer forward by 4 bytes
cmp rbx, r14 # register-to-register compare against the end pointer
jne .LOOP1 # repeat until the two pointers are equal
# Loop 2: the end pointer is read from memory at [rbx + 8] on every iteration.
# NOTE(review): rbx presumably holds the address of the vector object - not shown in this snippet.
.LOOP2:
mov edi, dword ptr [r14] # load the 4-byte element at r14 as the argument to f
call f(int)@PLT
add r14, 4 # step the element pointer forward by 4 bytes
cmp r14, qword ptr [rbx + 8] # compare against an 8-byte value loaded from memory at rbx + 8
jne .LOOP2 # repeat until the two values are equal
The only significant difference is the cmp instruction. Assuming the cmp in LOOP1 takes just one cycle, will the cmp in LOOP2 take more than one cycle because it first has to load its operand from memory at [rbx + 8]?
EDIT: The code comes from two different ways of iterating over the elements of a vector. The range-based for loop is slightly better, IMO, if the two forms of cmp really do perform differently.
https://godbolt.org/z/z9d94WGYh
#include <vector>
using namespace std;
int f(int);
// Calls f once for each element of input, in order, using a range-based for loop.
// A range-based for evaluates the begin and end of the range once, before the
// first iteration; in the gcc -O3 listing for this function both pointers are
// loaded ahead of the loop and the loop compares two registers.
void range(const vector<int> &input) {
for (int i : input) {
f(i);
}
}
// Calls f once for each element of input, in order, using an explicit iterator loop.
// input.end() appears in the loop condition, so it is evaluated again on every
// iteration; in the gcc -O3 listing for this function the end pointer is re-read
// from the vector after each call to f.
// NOTE(review): presumably the compiler cannot prove that f leaves the vector
// unchanged, which would explain why the load is not hoisted - confirm.
void iter(const vector<int> &input) {
for (auto i = input.begin(); i != input.end(); ++i) {
f(*i);
}
}
$ gcc -O3
# gcc -O3 output for range(): both pointers are loaded once, before the loop.
range(std::vector<int, std::allocator<int> > const&):
push rbp # save rbp and rbx; both are reused below and restored before ret
push rbx
sub rsp, 8 # undone by "add rsp, 8" at .L1 (presumably stack alignment for the call)
mov rbx, QWORD PTR [rdi] # rbx = pointer stored at offset 0 of the vector (current element)
mov rbp, QWORD PTR [rdi+8] # rbp = pointer stored at offset 8 of the vector (loop end)
cmp rbp, rbx
je .L1 # skip the loop when the two pointers are already equal
.L3:
mov edi, DWORD PTR [rbx] # load the current 4-byte element as the argument to f
add rbx, 4 # step to the next element
call f(int)
cmp rbp, rbx # register-to-register compare against the saved end pointer
jne .L3
.L1:
add rsp, 8
pop rbx
pop rbp
ret
# gcc -O3 output for iter(): the end pointer is re-read from memory on every iteration.
iter(std::vector<int, std::allocator<int> > const&):
push rbp # save rbp and rbx; both are reused below and restored before ret
push rbx
sub rsp, 8 # undone by "add rsp, 8" at .L7 (presumably stack alignment for the call)
mov rbx, QWORD PTR [rdi] # rbx = pointer stored at offset 0 of the vector (current element)
cmp rbx, QWORD PTR [rdi+8] # compare against the pointer stored at offset 8 (loop end)
je .L7 # skip the loop when the two pointers are already equal
mov rbp, rdi # keep the vector's address in rbp; rdi is overwritten with f's argument below
.L9:
mov edi, DWORD PTR [rbx] # load the current 4-byte element as the argument to f
add rbx, 4 # step to the next element
call f(int)
cmp QWORD PTR [rbp+8], rbx # memory operand: the end pointer is loaded from the vector again after each call
jne .L9
.L7:
add rsp, 8
pop rbx
pop rbp
ret