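If you are working on x86_64, the `mul` and `div` instructions operate on 128-bit intermediate values (in the rdx:rax register pair), which you can reach through inline asm: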
/*
 * Multiply two 64-bit values into a 128-bit product.
 * Returns the low 64 bits and stores the high 64 bits in *hi.
 */
static inline uint64_t fn_mul64(uint64_t x, uint64_t y, uint64_t *hi)
{
#if defined(__x86_64__) && defined(__GNUC__)
    uint64_t lo, up;

    /* mulq: rdx:rax = rax * operand; flags are modified. */
    __asm__("mulq %[y]"
            : "=a"(lo), "=d"(up)
            : "0"(x), [y] "rm"(y)
            : "cc");
    *hi = up;
    return lo;
#else
    unsigned __int128 p = (unsigned __int128)x * y;

    *hi = (uint64_t)(p >> 64);
    return (uint64_t)p;
#endif
}

/*
 * Divide the 128-bit value hi:lo by d.
 * Returns the quotient and stores the remainder in *rem.
 * Precondition: hi < d, so the quotient fits in 64 bits (divq raises a
 * divide error otherwise).
 */
static inline uint64_t fn_div128(uint64_t hi, uint64_t lo, uint64_t d,
                                 uint64_t *rem)
{
#if defined(__x86_64__) && defined(__GNUC__)
    uint64_t q, r;

    /* divq: rax = rdx:rax / operand, rdx = rdx:rax % operand. */
    __asm__("divq %[d]"
            : "=a"(q), "=d"(r)
            : "0"(lo), "1"(hi), [d] "rm"(d)
            : "cc");
    *rem = r;
    return q;
#else
    unsigned __int128 n = ((unsigned __int128)hi << 64) | lo;

    *rem = (uint64_t)(n % d);
    return (uint64_t)(n / d);
#endif
}

/*
 * Compute floor(a * b * c / d) for unsigned 64-bit inputs without losing
 * intermediate precision: the product a*b*c is held as a 192-bit value
 * (three 64-bit words) and divided by d word by word.
 *
 * Returns the low 64 bits of the quotient, converted to int64_t as the
 * signature requires. Quotient bits above bit 63 are discarded.
 *
 * d must be non-zero; division by zero is not handled here.
 */
int64_t fn(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
    uint64_t ab_hi, w0, w1, w2, mid, rem;
    uint64_t ab_lo = fn_mul64(a, b, &ab_hi);

    /* (ab_hi:ab_lo) * c  ->  w2:w1:w0 */
    w0 = fn_mul64(ab_lo, c, &w1);
    mid = fn_mul64(ab_hi, c, &w2);
    w1 += mid;
    w2 += (w1 < mid); /* propagate the carry out of the middle word */

    /*
     * Schoolbook long division, most significant word first. Each step
     * feeds its remainder (always < d) in as the high half of the next
     * dividend, which keeps every partial quotient within 64 bits.
     */
    (void)fn_div128(0, w2, d, &rem);
    (void)fn_div128(rem, w1, d, &rem);
    return (int64_t)fn_div128(rem, w0, d, &rem);
}
Please note that all input integers have to be the same length. Working length will be double the input. Works with unsigned only.
The native `div` and `mul` instructions on x86 work on double-length operands precisely to allow for such overflows. Sadly I am unaware of a compiler intrinsic to make use of them.