I have found that manually calculating the %
operator on __int128
is significantly faster than the built-in compiler operator. I will show you how to calculate modulo 9, but the method can be used to calculate modulo any other number.
First, consider the built-in compiler operator:
uint64_t mod9_v1(unsigned __int128 n)
{
return n % 9;
}
Now consider my manual implementation:
uint64_t mod9_v2(unsigned __int128 n)
{
uint64_t r = 0;
r += (uint32_t)(n);
r += (uint32_t)(n >> 32) * (uint64_t)4;
r += (uint32_t)(n >> 64) * (uint64_t)7;
r += (uint32_t)(n >> 96);
return r % 9;
}
Measuring over 100,000,000 random numbers gives the following results:
mod9_v1 | 3.986052 secs
mod9_v2 | 1.814339 secs
GCC 9.3.0 with -march=native -O3
was used on AMD Ryzen Threadripper 2990WX.
Here is a link to godbolt.
I would like to ask if it behaves the same way on your side? (Before reporting a bug to GCC Bugzilla).
UPDATE: On request, I supply a generated assembly:
mod9_v1:
sub rsp, 8
mov edx, 9
xor ecx, ecx
call __umodti3
add rsp, 8
ret
mod9_v2:
mov rax, rdi
shrd rax, rsi, 32
mov rdx, rsi
mov r8d, eax
shr rdx, 32
mov eax, edi
add rax, rdx
lea rax, [rax+r8*4]
mov esi, esi
lea rcx, [rax+rsi*8]
sub rcx, rsi
mov rax, rcx
movabs rdx, -2049638230412172401
mul rdx
mov rax, rdx
shr rax, 3
and rdx, -8
add rdx, rax
mov rax, rcx
sub rax, rdx
ret