Decompose your numbers a and b into 32 bits parts :
a = a1 * 2**32 + a0
b = b1 * 2**32 + b0
a * b = (a1 * b1) * 2**64 + (a1 * b0 + a0 * b1) * 2**32 + a0 * b0
To obtain the 64 high bits of the result, the a1 * b1 part is obvious, the other part should be decomposed : first add the 32 high bits of a1 * b0 to the 32 high bits of a0 * b1; second add the 32 low bits of (a1 * b0 + a0 * b1) to the 32 high bits of a0 * b0 and retain the 32 high bits of this intermediate result (actually 1 significant bit) in order to take into account overflow from low bits before discarding them.
Below is the code with a basic check of the result.
#include <cstdint>
#include <iostream>
using namespace std;
inline uint64_t high32(uint64_t x) {
return x >> 32;
}
inline uint64_t low32(uint64_t x) {
return static_cast<uint32_t>(x);
}
uint64_t mul64(uint64_t a, uint64_t b)
{
uint64_t a1 = high32(a);
uint64_t a0 = low32(a);
uint64_t b1 = high32(b);
uint64_t b0 = low32(b);
uint64_t a1_b0 = a1 * b0;
uint64_t a0_b1 = a0 * b1;
return a1 * b1 + high32(a1_b0) + high32(a0_b1)
+ high32(low32(a1_b0 + a0_b1) + high32(a0 * b0));
}
int main()
{
cout << mul64(UINT64_MAX, UINT64_MAX) << endl;
cout << UINT64_MAX - 1 << endl;
cout << endl;
cout << mul64(UINT64_MAX - 1, UINT64_MAX) << endl;
cout << UINT64_MAX - 2 << endl;
cout << endl;
cout << mul64(UINT64_MAX - 2, UINT64_MAX) << endl;
cout << UINT64_MAX - 3 << endl;
}