If you're doing performance testing, always build with --release. By default, Cargo builds with debugging information enabled and optimizations disabled. With optimizations enabled, the optimizer will completely eliminate these loops. On the Playground, the run time drops from 975ms to 1.25µs.
Let's take a look at the assembly on Godbolt for just the loops, no timer:
// Minimal reproduction: two nested counting loops with empty bodies and no
// timing code, so the generated assembly contains only the loop machinery.
pub fn main() {
// Outer loop over the range 0..5000; `i` is never read.
for i in 0..5000 {
// Inner loop over the range 0..5000 with an empty body; `j` is never read.
for j in 0..5000 {}
}
}
Without optimization:
# Unoptimized body of forward_unchecked for i32.
# In:  edi, esi = the two 32-bit operands.
# Out: eax = edi + esi.
# The sum is stored and reloaded through two stack slots before it is
# returned; none of those moves change the value.
<i32 as core::iter::range::Step>::forward_unchecked:
# The push only reserves 8 bytes of scratch stack ([rsp] and [rsp + 4]).
push rax
mov eax, esi
# 32-bit add; no overflow branch follows it in this listing.
add edi, eax
mov dword ptr [rsp + 4], edi
mov eax, dword ptr [rsp + 4]
mov dword ptr [rsp], eax
mov eax, dword ptr [rsp]
# Popping into rcx releases the scratch slot without touching the result in eax.
pop rcx
ret
# Unoptimized body of copy_nonoverlapping.
# In:  rdi, rsi = two pointers, rdx = a count.
# Swaps rdi and rsi through a stack slot, multiplies rdx by 4 and calls memcpy
# with the resulting three arguments.
core::intrinsics::copy_nonoverlapping:
# The push reserves the 8-byte slot used for the swap.
push rax
mov qword ptr [rsp], rsi
mov rsi, rdi
mov rdi, qword ptr [rsp]
# rdx *= 4. NOTE(review): presumably element count -> byte count for a 4-byte
# element type (i32 in this program) - confirm.
shl rdx, 2
call memcpy@PLT
# Releases the slot; this also overwrites whatever memcpy left in rax.
pop rax
ret
# Unoptimized `lt` for i32.
# In:  rdi, rsi = pointers to the two 32-bit operands.
# Out: eax = 1 if [rdi] < [rsi] as signed integers, otherwise 0.
core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt:
mov eax, dword ptr [rdi]
cmp eax, dword ptr [rsi]
# setl uses the signed less-than condition and writes 0 or 1 into al.
setl al
# al already holds 0 or 1; the mask and zero-extension only widen it into eax.
and al, 1
movzx eax, al
ret
# Unoptimized body of core::mem::replace for a 4-byte value.
# In:  rdi = destination pointer, esi = replacement value.
# Out: eax = the value core::ptr::read returned for the destination, obtained
#      before core::ptr::write stores the replacement.
# Stack (40 bytes): [rsp] = saved pointer, [rsp + 12] = saved replacement,
#      [rsp + 16] = read result, [rsp + 23] = one-byte flag tested on the
#      unwind path, [rsp + 24] / [rsp + 32] = rax / edx saved on the unwind path.
core::mem::replace:
sub rsp, 40
mov qword ptr [rsp], rdi
mov dword ptr [rsp + 12], esi
# The flag byte is cleared and then immediately set.
# NOTE(review): presumably a drop flag for the replacement value - confirm.
mov byte ptr [rsp + 23], 0
mov byte ptr [rsp + 23], 1
# Indirect call through the GOT; rdi has not been modified since entry, so it
# still holds the destination pointer.
mov rax, qword ptr [rip + core::ptr::read@GOTPCREL]
call rax
mov ecx, eax
mov dword ptr [rsp + 16], ecx
# Jump to the very next instruction: a basic-block boundary left in place.
jmp .LBB3_1
.LBB3_1:
# Reload both arguments from the stack, clear the flag, then store the
# replacement through core::ptr::write.
mov esi, dword ptr [rsp + 12]
mov rdi, qword ptr [rsp]
mov byte ptr [rsp + 23], 0
mov rcx, qword ptr [rip + core::ptr::write@GOTPCREL]
call rcx
jmp .LBB3_4
.LBB3_2:
# Unwind path: both outcomes of the flag test end up at .LBB3_7
# (.LBB3_8 is just a jump to .LBB3_7).
test byte ptr [rsp + 23], 1
jne .LBB3_8
jmp .LBB3_7
# No label in this listing targets the next five instructions and the jmp above
# never falls through. NOTE(review): presumably an exception landing pad whose
# label was filtered out of the listing - confirm.
# It saves rax and edx to the stack and joins the unwind path.
mov rcx, rax
mov eax, edx
mov qword ptr [rsp + 24], rcx
mov dword ptr [rsp + 32], eax
jmp .LBB3_2
.LBB3_4:
# Normal return: hand back the value obtained from core::ptr::read.
mov eax, dword ptr [rsp + 16]
add rsp, 40
ret
.LBB3_5:
jmp .LBB3_2
# Second label-less block with the same shape as the one above; it reaches
# .LBB3_2 via .LBB3_5. NOTE(review): presumably another landing pad - confirm.
mov rcx, rax
mov eax, edx
mov qword ptr [rsp + 24], rcx
mov dword ptr [rsp + 32], eax
jmp .LBB3_5
.LBB3_7:
# Pass the quadword saved at [rsp + 24] to _Unwind_Resume.
mov rdi, qword ptr [rsp + 24]
call _Unwind_Resume@PLT
# ud2 traps if execution ever continues past the call.
ud2
.LBB3_8:
jmp .LBB3_7
# Unoptimized body of core::ptr::read for a 4-byte value.
# In:  rdi = source pointer.
# Out: eax = the 4-byte value found in the local slot [rsp + 16] after
#      copy_nonoverlapping has been called with (source, &slot, 1).
core::ptr::read:
sub rsp, 24
mov qword ptr [rsp + 8], rdi
# [rsp + 20] has not been written in this function when it is copied into
# [rsp + 16]. NOTE(review): presumably the initial state of an uninitialised
# temporary that the copy below fills in - confirm.
mov eax, dword ptr [rsp + 20]
mov dword ptr [rsp + 16], eax
# Jump to the very next instruction: a basic-block boundary left in place.
jmp .LBB4_2
.LBB4_2:
# Arguments: rdi = saved source pointer, rsi = address of the local slot,
# edx = 1.
mov rdi, qword ptr [rsp + 8]
lea rsi, [rsp + 16]
mov edx, 1
call qword ptr [rip + core::intrinsics::copy_nonoverlapping@GOTPCREL]
# Read the slot back and return it (bounced through [rsp + 4] first).
mov eax, dword ptr [rsp + 16]
mov dword ptr [rsp + 4], eax
mov eax, dword ptr [rsp + 4]
add rsp, 24
ret
# Unoptimized body of core::ptr::write for a 4-byte value.
# In:  rdi = destination pointer, esi = value to store.
# Stores esi to [rdi] after passing it through a 4-byte stack temporary.
# Makes no calls.
core::ptr::write:
sub rsp, 4
mov dword ptr [rsp], esi
mov eax, dword ptr [rsp]
mov dword ptr [rdi], eax
add rsp, 4
ret
# Unoptimized Iterator::next for Range: a thin wrapper around spec_next.
# No argument register is written before the call, so the callee receives the
# same arguments this function was given. The eax/edx pair that comes back is
# stored to the stack, reloaded and returned unchanged.
core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next:
# The push reserves 8 bytes of scratch stack for the two 32-bit results.
push rax
call qword ptr [rip + <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next@GOTPCREL]
mov dword ptr [rsp], eax
mov dword ptr [rsp + 4], edx
mov edx, dword ptr [rsp + 4]
mov eax, dword ptr [rsp]
# Popping into rcx releases the scratch slot without touching eax or edx.
pop rcx
ret
# Unoptimized clone for i32.
# In:  rdi = pointer to the value.
# Out: eax = the 32-bit value loaded from [rdi].
core::clone::impls::<impl core::clone::Clone for i32>::clone:
mov eax, dword ptr [rdi]
ret
# Unoptimized into_iter: returns its two 32-bit arguments unchanged.
# In:  edi, esi.
# Out: eax = edi, edx = esi.
<I as core::iter::traits::collect::IntoIterator>::into_iter:
mov edx, esi
mov eax, edi
ret
# Unoptimized spec_next for Range.
# In:  rdi = pointer to two consecutive 32-bit fields, [rdi] and [rdi + 4].
#      NOTE(review): presumably the range's start and end - confirm.
# Out: eax = 0 when [rdi] < [rdi + 4] is false; edx is then loaded from
#      [rsp + 36], which is not written on that path.
#      eax = 1 otherwise, with edx = the value core::mem::replace returned,
#      after the first field has been replaced with itself plus 1.
<core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next:
sub rsp, 40
mov rsi, rdi
# Keep the pointer in [rsp + 16]; it is reloaded after each call below.
mov qword ptr [rsp + 16], rsi
mov rdi, rsi
# rdi = address of the first field, rsi = address of the second (4 bytes on).
add rsi, 4
call core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt
mov byte ptr [rsp + 31], al
mov al, byte ptr [rsp + 31]
# Bit 0 set means the comparison was true.
test al, 1
jne .LBB9_3
jmp .LBB9_2
.LBB9_2:
# Comparison false: only the first result word is set (to 0).
mov dword ptr [rsp + 32], 0
jmp .LBB9_7
.LBB9_3:
# Comparison true: load the first field via clone ...
mov rdi, qword ptr [rsp + 16]
call core::clone::impls::<impl core::clone::Clone for i32>::clone
mov dword ptr [rsp + 12], eax
# ... add 1 to it via forward_unchecked ...
mov edi, dword ptr [rsp + 12]
mov esi, 1
call <i32 as core::iter::range::Step>::forward_unchecked
mov dword ptr [rsp + 8], eax
# ... and store the result through the pointer with mem::replace, keeping
# the value that call returns.
mov esi, dword ptr [rsp + 8]
mov rdi, qword ptr [rsp + 16]
call qword ptr [rip + core::mem::replace@GOTPCREL]
mov dword ptr [rsp + 4], eax
mov eax, dword ptr [rsp + 4]
mov dword ptr [rsp + 36], eax
mov dword ptr [rsp + 32], 1
# Falls through into the shared exit below.
.LBB9_7:
# Return the two result words in eax and edx.
mov eax, dword ptr [rsp + 32]
mov edx, dword ptr [rsp + 36]
add rsp, 40
ret
# Unoptimized example::main: two nested loops, each driven by calls to
# into_iter and Iterator::next.
# Stack (72 bytes): [rsp + 24..28] = outer pair (0, 5000) before into_iter,
#      [rsp + 32..36] = outer iterator state passed by address to next,
#      [rsp + 40..44] = result of the outer next call,
#      [rsp + 48..52] = inner pair (0, 5000) before into_iter,
#      [rsp + 56..60] = inner iterator state, [rsp + 64..68] = inner next result.
example::main:
sub rsp, 72
# Build the pair (0, 5000) and pass it to into_iter in edi/esi.
mov dword ptr [rsp + 24], 0
mov dword ptr [rsp + 28], 5000
mov edi, dword ptr [rsp + 24]
mov esi, dword ptr [rsp + 28]
call qword ptr [rip + <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]
# Copy the returned eax/edx pair into the outer iterator state.
mov dword ptr [rsp + 16], eax
mov dword ptr [rsp + 20], edx
mov eax, dword ptr [rsp + 20]
mov ecx, dword ptr [rsp + 16]
mov dword ptr [rsp + 32], ecx
mov dword ptr [rsp + 36], eax
.LBB10_2:
# Outer loop head: call next with the address of the outer iterator state.
mov rax, qword ptr [rip + core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]
lea rdi, [rsp + 32]
call rax
mov dword ptr [rsp + 44], edx
mov dword ptr [rsp + 40], eax
# The 32-bit load zeroes the upper half of rax, so the 64-bit test below
# checks only the value that came back in eax.
mov eax, dword ptr [rsp + 40]
test rax, rax
# Zero: leave the outer loop and return. Non-zero: run the inner loop.
je .LBB10_5
jmp .LBB10_13
.LBB10_13:
jmp .LBB10_6
# Not reachable by fall-through: the jmp above is unconditional.
ud2
.LBB10_5:
add rsp, 72
ret
.LBB10_6:
# Outer loop body: build a fresh (0, 5000) pair for the inner loop.
mov dword ptr [rsp + 48], 0
mov dword ptr [rsp + 52], 5000
mov edi, dword ptr [rsp + 48]
mov esi, dword ptr [rsp + 52]
call qword ptr [rip + <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]
# Copy the returned eax/edx pair into the inner iterator state.
mov dword ptr [rsp + 8], eax
mov dword ptr [rsp + 12], edx
mov eax, dword ptr [rsp + 12]
mov ecx, dword ptr [rsp + 8]
mov dword ptr [rsp + 56], ecx
mov dword ptr [rsp + 60], eax
.LBB10_8:
# Inner loop head: call next with the address of the inner iterator state.
mov rax, qword ptr [rip + core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]
lea rdi, [rsp + 56]
call rax
mov dword ptr [rsp + 68], edx
mov dword ptr [rsp + 64], eax
# Same zero test as in the outer loop.
mov eax, dword ptr [rsp + 64]
test rax, rax
# Zero: back to the outer loop head. Non-zero: next inner iteration.
je .LBB10_11
jmp .LBB10_14
.LBB10_14:
jmp .LBB10_12
# Not reachable by fall-through: the jmp above is unconditional.
ud2
.LBB10_11:
jmp .LBB10_2
.LBB10_12:
# The inner loop body is empty, so this block only jumps back to the loop head.
jmp .LBB10_8
# Data, not code: a NUL-terminated string made of the byte 0x01 followed by the
# file name gdb_load_rust_pretty_printers.py.
# NOTE(review): presumably read by GDB to auto-load Rust pretty-printers - confirm.
__rustc_debug_gdb_scripts_section__:
.asciz "\001gdb_load_rust_pretty_printers.py"
# Data, not code: one 8-byte word holding the address of rust_eh_personality.
# NOTE(review): presumably the indirect personality-routine reference used by
# the unwind tables - confirm.
DW.ref.rust_eh_personality:
.quad rust_eh_personality
With optimization:
# Optimized example::main: the whole function is a single ret. None of the
# loop, iterator or helper code from the unoptimized listing remains.
example::main:
ret