On amd64 / x86_64, fastest to slowest are:
- trailing zeros
- leading zeros / trailing ones (tied)
- leading ones
On arm64 / aarch64, fastest to slowest are:
- leading zeros
- leading ones / trailing zeros (tied)
- trailing ones
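Compiler output from godbolt.org (Rust source first, then the assembly generated for each target; the rankings above match the instruction counts below):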
/// Counts the zero bits at the most-significant end of `num`.
///
/// Returns 64 when `num` is 0.
pub fn lz(num: u64) -> u32 {
    u64::leading_zeros(num)
}
/// Counts the one bits at the most-significant end of `num`.
///
/// Returns 64 when every bit of `num` is set.
pub fn lo(num: u64) -> u32 {
    u64::leading_ones(num)
}
/// Counts the zero bits at the least-significant end of `num`.
///
/// Returns 64 when `num` is 0.
pub fn tz(num: u64) -> u32 {
    u64::trailing_zeros(num)
}
/// Counts the one bits at the least-significant end of `num`.
///
/// Returns 64 when every bit of `num` is set.
pub fn to(num: u64) -> u32 {
    u64::trailing_ones(num)
}
amd64 / x86_64:
# leading_zeros on x86_64: input in rdi, result in rax.
example::lz:
test rdi, rdi          # set ZF when the input is zero
je .LBB0_1             # zero input takes the special-case path below
bsr rax, rdi           # rax = bit index of the highest set bit (bsr result is not defined for a zero source)
xor rax, 63            # index is in 0..=63, so index ^ 63 == 63 - index == leading-zero count
ret
.LBB0_1:
mov eax, 64            # zero input: all 64 bits are leading zeros
ret
# leading_ones on x86_64: input in rdi, result in rax.
# Same sequence as the leading-zeros routine, applied to the inverted input.
example::lo:
not rdi                # invert, so leading ones become leading zeros (not leaves the flags untouched)
test rdi, rdi          # set ZF when the inverted value is zero, i.e. the input was all ones
je .LBB1_1             # all-ones input takes the special-case path below
bsr rax, rdi           # rax = bit index of the highest set bit of the inverted value
xor rax, 63            # index is in 0..=63, so index ^ 63 == 63 - index == leading-one count
ret
.LBB1_1:
mov eax, 64            # all-ones input: all 64 bits are leading ones
ret
# trailing_zeros on x86_64: input in rdi, result in rax.
example::tz:
test rdi, rdi          # set ZF when the input is zero
je .LBB2_1             # zero input takes the special-case path below
bsf rax, rdi           # rax = bit index of the lowest set bit, which is the trailing-zero count
ret
.LBB2_1:
mov eax, 64            # zero input: all 64 bits are trailing zeros
ret
# trailing_ones on x86_64: input in rdi, result in rax.
# Same sequence as the trailing-zeros routine, applied to the inverted input.
example::to:
not rdi                # invert, so trailing ones become trailing zeros (not leaves the flags untouched)
test rdi, rdi          # set ZF when the inverted value is zero, i.e. the input was all ones
je .LBB3_1             # all-ones input takes the special-case path below
bsf rax, rdi           # rax = bit index of the lowest set bit of the inverted value, which is the trailing-one count
ret
.LBB3_1:
mov eax, 64            # all-ones input: all 64 bits are trailing ones
ret
arm64 / aarch64:
// leading_zeros on aarch64: input and result in x0.
example::lz:
clz x0, x0             // count leading zeros directly; no branch for a zero input appears in this output
ret
// leading_ones on aarch64: input and result in x0, x8 used as scratch.
example::lo:
mvn x8, x0             // x8 = bitwise NOT of the input, so leading ones become leading zeros
clz x0, x8             // count leading zeros of the inverted value
ret
// trailing_zeros on aarch64: input and result in x0, x8 used as scratch.
example::tz:
rbit x8, x0            // x8 = input with its bit order reversed, so trailing zeros become leading zeros
clz x0, x8             // count leading zeros of the reversed value
ret
// trailing_ones on aarch64: input and result in x0, x8 used as scratch.
example::to:
mvn x8, x0             // x8 = bitwise NOT of the input, so trailing ones become trailing zeros
rbit x8, x8            // reverse the bit order, so those trailing zeros become leading zeros
clz x0, x8             // count leading zeros of the inverted, reversed value
ret