I was curious about the differences in performance when dividing two values with a varying number of value bits but a constant operand (register) width. I expected the performance to depend on the highest set bit of the dividend, since I assume the CPU internally uses an iterative subtract-and-shift "algorithm", possibly performing multiple iterations per clock cycle.
So I wrote a little C++20-program that tests this:
#include <iostream>
#include <type_traits>
#include <cstdint>
#include <climits>   // CHAR_BIT (was missing; the code uses it below)
#include <random>
#include <limits>
#include <atomic>
#include <chrono>
#include <vector>
#include <sstream>

using namespace std;
using namespace chrono;

int main()
{
    constexpr size_t ROUNDS = 100'000'000;
    // Benchmark unsigned division with register width TOp and value width
    // TValue (TValue fits inside TOp); returns average ns per division.
    auto ssp = []<typename TOp, typename TValue>() -> double
        requires is_unsigned_v<TOp> && is_unsigned_v<TValue> && (sizeof(TOp) >= sizeof(TValue))
    {
        constexpr size_t N = 0x1000;
        vector<TOp> vDividends( N ), vDivisors( N );
        mt19937_64 mt;
        uniform_int_distribution<unsigned> uidBits( 0, sizeof(TValue) * CHAR_BIT - 1 );
        uniform_int_distribution<uint64_t> uidValue( 0, numeric_limits<TValue>::max() );
        // Draw until the top bit of TValue is set, so that after the right
        // shift below the position of the highest set bit is exactly known.
        auto getMaxBitsValue = [&]() -> TOp
        {
            for( TValue value; ; )
                if( (make_signed_t<TValue>)(value = (TValue)uidValue( mt )) < 0 )
                    return value;
        };
        auto getDividend = [&]() -> TOp
        {
            return getMaxBitsValue() >> uidBits( mt );
        };
        // Divisor must be <= dividend (and is always >= 1, since the
        // top-bit-set value shifted right by at most width-1 is nonzero).
        auto getDivisor = [&]( TOp dividend ) -> TOp
        {
            for( TOp divisor; ; )
                if( (divisor = getMaxBitsValue() >> uidBits( mt )) <= dividend )
                    return divisor;
        };
        for( size_t i = 0; i != N; ++i )
            vDivisors[i] = getDivisor( (vDividends[i] = getDividend()) );
        TOp sum = 0;  // fix: was used below but never declared
        auto start = high_resolution_clock::now();
        for( size_t r = ROUNDS; r--; )
            sum += vDividends[r % N] / vDivisors[r % N];
        double ns = (int64_t)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count() / (double)ROUNDS;
        // Publish the accumulator through an atomic so the optimizer cannot
        // discard the timed division loop as dead code.
        atomic<TOp> aSum( sum );
        (void)aSum;
        return ns;
    };
    // Format a timing, optionally as a percentage of the reference timing.
    auto results = []( double ns, double nsRef )
    {
        ostringstream oss;
        oss << ns;
        if( nsRef > 0.0 )
            oss << " (" << (int)(ns / nsRef * 100.0 + 0.5) << "%)";
        return oss.str();
    };
    double nsRef;
    cout << "8v, 8op: " << results( (nsRef = ssp.template operator ()<uint8_t, uint8_t>()), 0.0 ) << endl;
    cout << "8v, 16op: " << results( ssp.template operator ()<uint16_t, uint8_t>(), nsRef ) << endl;
    cout << "8v, 32op: " << results( ssp.template operator ()<uint32_t, uint8_t>(), nsRef ) << endl;
    cout << "8v, 64op: " << results( ssp.template operator ()<uint64_t, uint8_t>(), nsRef ) << endl;
    cout << "16v, 16op: " << results( ssp.template operator ()<uint16_t, uint16_t>(), nsRef ) << endl;
    cout << "16v, 32op: " << results( ssp.template operator ()<uint32_t, uint16_t>(), nsRef ) << endl;
    cout << "16v, 64op: " << results( ssp.template operator ()<uint64_t, uint16_t>(), nsRef ) << endl;
    cout << "32v, 32op: " << results( ssp.template operator ()<uint32_t, uint32_t>(), nsRef ) << endl;
    cout << "32v, 64op: " << results( ssp.template operator ()<uint64_t, uint32_t>(), nsRef ) << endl;
    cout << "64v, 64op: " << results( ssp.template operator ()<uint64_t, uint64_t>(), nsRef ) << endl;
}
The results are the same for the same number of value bits with varying operand bits, apart from slight measurement errors. But what is the CPU-architectural reason why a division of a 64-bit value by another 64-bit value takes only about 50% more time than a division of an 8-bit value by another 8-bit value on my computer (Ryzen Threadripper 3990X)? Even on my Linux computer equipped with a Ryzen 7 1800X the percentage relations are roughly the same.
I post this in "x86", "x86-64" and "assembly" and not in C++ since I'm not asking about a C++ issue but a machine-level issue.
[EDIT]: This is a newer version of the code for MSVC++ with some serialized assembly code (MASM):
#include <iostream>
#include <type_traits>
#include <cstdint>
#include <climits>   // CHAR_BIT (was missing; the code uses it below)
#include <random>
#include <limits>
#include <atomic>
#include <chrono>
#include <vector>
#include <sstream>

using namespace std;
using namespace chrono;

// Division loops implemented in external MASM; each performs "rounds"
// divisions dividends[i % n] / divisors[i % n] and returns the last quotient.
uint64_t divLoop( uint64_t *dividends, uint64_t *divisors, size_t n, size_t rounds );
uint32_t divLoop( uint32_t *dividends, uint32_t *divisors, size_t n, size_t rounds );
uint16_t divLoop( uint16_t *dividends, uint16_t *divisors, size_t n, size_t rounds );
uint8_t divLoop( uint8_t *dividends, uint8_t *divisors, size_t n, size_t rounds );

int main()
{
    constexpr size_t ROUNDS = 100'000'000;
    // Benchmark the assembly division loop; the dummy parameters only select
    // the operand width TOp and the value width TValue.
    auto ssp = []<typename TOp, typename TValue>( TOp const &, TValue const & ) -> double
        requires is_unsigned_v<TOp> && is_unsigned_v<TValue> && (sizeof(TOp) >= sizeof(TValue))
    {
        constexpr size_t N = 0x1000;
        vector<TOp> vDividends( N ), vDivisors( N );
        mt19937_64 mt;
        uniform_int_distribution<unsigned> uidBits( 0, sizeof(TValue) * CHAR_BIT - 1 );
        uniform_int_distribution<uint64_t> uidValue( 0, numeric_limits<TValue>::max() );
        // Draw until the top bit of TValue is set, so that after the right
        // shift below the position of the highest set bit is exactly known.
        auto getMaxBitsValue = [&]() -> TOp
        {
            for( TValue value; ; )
                if( (make_signed_t<TValue>)(value = (TValue)uidValue( mt )) < 0 )
                    return value;
        };
        auto getDividend = [&]() -> TOp
        {
            return getMaxBitsValue() >> uidBits( mt );
        };
        // Divisor must be <= dividend (and is always >= 1).
        auto getDivisor = [&]( TOp dividend ) -> TOp
        {
            for( TOp divisor; ; )
                if( (divisor = getMaxBitsValue() >> uidBits( mt )) <= dividend )
                    return divisor;
        };
        for( size_t i = 0; i != N; ++i )
            vDivisors[i] = getDivisor( (vDividends[i] = getDividend()) );
        auto start = high_resolution_clock::now();
        // fix: keep the returned quotient (it was discarded before, so the
        // published sum was always 0 and the sink proved nothing)
        TOp sum = divLoop( &vDividends[0], &vDivisors[0], N, ROUNDS );
        double ns = (int64_t)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count() / (double)ROUNDS;
        // Publish the result through an atomic so the call cannot be elided.
        atomic<TOp> aSum( sum );
        (void)aSum;
        return ns;
    };
    // Format a timing, optionally as a percentage of the reference timing.
    auto results = []( double ns, double nsRef )
    {
        ostringstream oss;
        oss << ns;
        if( nsRef > 0.0 )
            oss << " (" << (int)(ns / nsRef * 100.0 + 0.5) << "%)";
        return oss.str();
    };
    double nsRef;
    cout << "8v, 8op: " << results( (nsRef = ssp( uint8_t(), uint8_t() )), 0.0 ) << endl;
    cout << "8v, 16op: " << results( ssp( uint16_t(), uint8_t() ), nsRef ) << endl;
    cout << "8v, 32op: " << results( ssp( uint32_t(), uint8_t() ), nsRef ) << endl;
    cout << "8v, 64op: " << results( ssp( uint64_t(), uint8_t() ), nsRef ) << endl;
    cout << "16v, 16op: " << results( ssp( uint16_t(), uint16_t() ), nsRef ) << endl;
    cout << "16v, 32op: " << results( ssp( uint32_t(), uint16_t() ), nsRef ) << endl;
    cout << "16v, 64op: " << results( ssp( uint64_t(), uint16_t() ), nsRef ) << endl;
    cout << "32v, 32op: " << results( ssp( uint32_t(), uint32_t() ), nsRef ) << endl;
    cout << "32v, 64op: " << results( ssp( uint64_t(), uint32_t() ), nsRef ) << endl;
    cout << "64v, 64op: " << results( ssp( uint64_t(), uint64_t() ), nsRef ) << endl;
}
MASM:
; uint64_t divLoop( uint64_t *dividends, uint64_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YA_KPEA_K0_K1@Z
; uint32_t divLoop( uint32_t *dividends, uint32_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YAIPEAI0_K1@Z
; uint16_t divLoop( uint16_t *dividends, uint16_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YAGPEAG0_K1@Z
; uint8_t divLoop( uint8_t *dividends, uint8_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YAEPEAE0_K1@Z
_TEXT SEGMENT
; rcx = dividends
; rdx = divisors
; r8 = n
; r9 = rounds
; 64-bit variant: performs "rounds" divisions dividends[i] / divisors[i],
; walking the arrays downward, restarting from n-1 until rounds are used up.
; Win64 args: rcx = dividends, rdx = divisors, r8 = n, r9 = rounds.
; Returns the last quotient in rax.
?divLoop@@YA_KPEA_K0_K1@Z PROC
push r12 ; r12/r13 are callee-saved in the Win64 ABI
push r13
mov r10, rdx ; move divisors ptr out of rdx - div clobbers rdx
sub r13, r13 ; r13 = 0; also carries the serializing dependency chain
outerLoop:
mov r11, r8 ; r11 = min(n, remaining rounds)
cmp r8, r9
cmova r11, r9
sub r11, 1 ; index runs r11-1 .. 0
jc byebye ; rounds exhausted (or 0 on entry)
innerLoop:
mov rax, [rcx + r11 * 8] ; low half of the 128-bit dividend rdx:rax
mov r12, [r10 + r11 * 8] ; divisor
mov rdx, r13 ; high half = r13, which is always 0 - but reading r13
             ; makes this div data-dependent on the previous quotient,
             ; serializing the divisions
div r12 ; rax = quotient, rdx = remainder
mov r13, rax ; feed quotient into the chain ...
and r13, 0   ; ... then clear it again. NOTE(review): assumes "and reg,0",
             ; unlike "xor reg,reg", is NOT treated as dependency-breaking
             ; by the CPU, so the chain stays intact - confirm per uarch
sub r11, 1
jnc innerLoop ; continue while index hasn't wrapped below 0
sub r9, r8 ; consumed up to n rounds in this pass
ja outerLoop ; more rounds left (no borrow and nonzero)
byebye:
pop r13
pop r12
ret ; last quotient in rax
?divLoop@@YA_KPEA_K0_K1@Z ENDP
; 32-bit variant: same algorithm as the 64-bit divLoop, but dividing
; edx:eax by a 32-bit divisor. rcx = dividends, rdx = divisors,
; r8 = n, r9 = rounds; returns the last quotient in eax.
?divLoop@@YAIPEAI0_K1@Z PROC
push r12 ; callee-saved in the Win64 ABI
push r13
mov r10, rdx ; free rdx for div
sub r13, r13 ; r13 = 0; dependency-chain register
outerLoop:
mov r11, r8 ; r11 = min(n, remaining rounds)
cmp r8, r9
cmova r11, r9
sub r11, 1
jc byebye
innerLoop:
mov eax, [rcx + r11 * 4] ; low half of dividend edx:eax
mov r12d, [r10 + r11 * 4] ; divisor
mov edx, r13d ; high half = 0, but the read of r13d chains this div
              ; to the previous quotient (serialization)
div r12d ; eax = quotient, edx = remainder
mov r13d, eax ; feed quotient into the chain ...
and r13d, 0   ; ... and clear it; presumably not dependency-breaking
              ; (unlike xor) - NOTE(review): confirm per uarch
sub r11, 1
jnc innerLoop
sub r9, r8 ; consumed up to n rounds
ja outerLoop
byebye:
pop r13
pop r12
ret ; last quotient in eax
?divLoop@@YAIPEAI0_K1@Z ENDP
; 16-bit variant: same algorithm as the 64-bit divLoop, but dividing
; dx:ax by a 16-bit divisor. rcx = dividends, rdx = divisors,
; r8 = n, r9 = rounds; returns the last quotient in ax.
?divLoop@@YAGPEAG0_K1@Z PROC
push r12 ; callee-saved in the Win64 ABI
push r13
mov r10, rdx ; free rdx for div
sub r13, r13 ; r13 = 0; dependency-chain register
outerLoop:
mov r11, r8 ; r11 = min(n, remaining rounds)
cmp r8, r9
cmova r11, r9
sub r11, 1
jc byebye
innerLoop:
mov ax, [rcx + r11 * 2] ; low half of dividend dx:ax
mov r12w, [r10 + r11 * 2] ; divisor
mov dx, r13w ; high half = 0, but the read of r13w chains this div
             ; to the previous quotient (serialization)
div r12w ; ax = quotient, dx = remainder
mov r13w, ax ; feed quotient into the chain ...
and r13w, 0  ; ... and clear it; presumably not dependency-breaking
             ; (unlike xor) - NOTE(review): confirm per uarch
sub r11, 1
jnc innerLoop
sub r9, r8 ; consumed up to n rounds
ja outerLoop
byebye:
pop r13
pop r12
ret ; last quotient in ax
?divLoop@@YAGPEAG0_K1@Z ENDP
; 8-bit variant: same algorithm as the 64-bit divLoop, but dividing
; ah:al (i.e. ax) by an 8-bit divisor. rcx = dividends, rdx = divisors,
; r8 = n, r9 = rounds; returns the last quotient in al.
?divLoop@@YAEPEAE0_K1@Z PROC
push r12 ; callee-saved in the Win64 ABI
push r13
mov r10, rdx ; free rdx (dl is used as an intermediate below)
sub r13, r13 ; r13 = 0; dependency-chain register
outerLoop:
mov r11, r8 ; r11 = min(n, remaining rounds)
cmp r8, r9
cmova r11, r9
sub r11, 1
jc byebye
innerLoop:
mov al, [rcx + r11] ; low byte of dividend ah:al
mov r12b, [r10 + r11] ; divisor
mov dl, r13b ; route the chain value through dl ...
mov ah, dl   ; ... into ah, the high byte of the dividend (always 0,
             ; but keeps the serializing dependency on the prior quotient)
div r12b ; al = quotient, ah = remainder
mov r13b, al ; feed quotient into the chain ...
and r13b, 0  ; ... and clear it; presumably not dependency-breaking
             ; (unlike xor) - NOTE(review): confirm per uarch
sub r11, 1
jnc innerLoop
sub r9, r8 ; consumed up to n rounds
ja outerLoop
byebye:
pop r13
pop r12
ret ; last quotient in al
?divLoop@@YAEPEAE0_K1@Z ENDP
_TEXT ENDS
END
Now this code perfectly demonstrates that divisions don't depend on the register width but on the operand width. But I'm still asking myself why a 64-by-64 division (operand and register width) is only about 1/3 slower than an 8-by-8 division.