It seems one has to build a custom emulator, like the sample below. Direct bitwise reinterpretation of an integer as a float (the intMagic functions) gives the right answer only for small integers, like 1 to 1023 here, or anything that fits in the mantissa bits, but it does avoid one conversion/cast latency. This is only a sample, not meant for exact work:
https://godbolt.org/z/x4c5dGndz
#include <cstdint> // <stdint.h> also works, but <cstdint> is preferred in C++
#include <cstring>
#include <iostream>
#ifdef _MSC_VER
#  include <intrin.h>
#else
#  include <x86intrin.h>
#endif
// optional wrapper if you don't want to just use __rdtsc() everywhere
inline uint64_t readTSC() {
    // _mm_lfence(); // optionally wait for earlier insns to retire before reading the clock
    uint64_t tsc = __rdtsc();
    // _mm_lfence(); // optionally block later instructions until rdtsc retires
    return tsc;
}
void intTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for(int i=0;i<1024;i++)
    {
        val3[i] = val1[i]/val2[i]; // one scalar div per element
    }
}
void intEmulationTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for(int i=0;i<1024;i++)
    {
        double v1 = val1[i];
        double v2 = val2[i];
        double v3 = v1/v2;
        // crude guard against the quotient landing just below an integer
        // before the truncating conversion
        double t = v3 - (uint32_t)v3;
        v3 += t<0.99?0.01:0.0;
        val3[i] = v3; // 42 instructions of code-bloat, yet 2x faster than one idiv >:c
    }
}
// Writes the bits of an integer directly into the mantissa of a float:
// a 23-bit integer n reinterpreted as a float is the denormal n * 2^-149,
// and the 2^-149 scale cancels in the division, so v1/v2 computes n/m.
// Up to 23 bits should be ok.
// Do not use -ffast-math: it flushes these denormals to zero!!
// FP rounding mode "truncation" (round-toward-zero) is required
// (see the sketch after the listing), and do not divide by zero.
// Warning: 10x speedup on the Zen 2 architecture.
void intMagicTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for(int i=0;i<1024;i++)
    {
        float v1;
        float v2;
        std::memcpy(
            &v1,          // mantissa dest
            &val1[i],     // 23 least significant bits src
            sizeof(float) // copy all 4 bytes; assumes float is the same size as uint32_t
        );
        std::memcpy(&v2,&val2[i],sizeof(float));
        // I don't know how to turn the result of v1/v2 back into a denormal
        // (so just let the compiler convert it).
        // If that de-normalization were possible,
        // this could have no conversion latency at all.
        val3[i] = v1/v2; // vdivps when vectorized, with only 1 conversion
    }
}
// Writes the bits of a 32-bit integer (held in 64-bit storage)
// directly into the mantissa of a double
// (52 explicit mantissa bits, plenty for 32-bit values).
// Do not use -ffast-math: it flushes these denormals to zero!!
// FP rounding mode "truncation" is required, and do not divide by zero.
// Warning: 10x speedup on the Zen 2 architecture.
void intMagicTestDouble(uint64_t * const __restrict__ val1, uint64_t * const __restrict__ val2, uint64_t * const __restrict__ val3)
{
    for(int i=0;i<1024;i++)
    {
        double v1;
        double v2;
        std::memcpy(
            &v1,           // mantissa dest
            &val1[i],      // least significant bits src (values must fit in 52 bits)
            sizeof(double) // copy all 8 bytes; assumes double is the same size as uint64_t
        );
        std::memcpy(&v2,&val2[i],sizeof(double));
        // I don't know how to turn the result of v1/v2 back into a denormal
        // (so just let the compiler convert it).
        // If that de-normalization were possible,
        // this could have no conversion latency at all.
        val3[i] = v1/v2; // vdivpd when vectorized, with only 1 conversion
    }
}
// Writes the bits of a 32-bit integer (via a temporary 64-bit variable)
// directly into the mantissa of a double
// (52 explicit mantissa bits, plenty for 32-bit values).
// Do not use -ffast-math: it flushes these denormals to zero!!
// FP rounding mode "truncation" is required, and do not divide by zero.
// Warning: 10x speedup on the Zen 2 architecture.
void intMagicTestDoubleTmp(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    for(int i=0;i<1024;i++)
    {
        uint64_t tmp1 = val1[i];
        uint64_t tmp2 = val2[i];
        double v1;
        double v2;
        std::memcpy(
            &v1,           // mantissa dest
            &tmp1,         // least significant bits src (values must fit in 52 bits)
            sizeof(double) // copy all 8 bytes; assumes double is the same size as uint64_t
        );
        std::memcpy(&v2,&tmp2,sizeof(double));
        // I don't know how to turn the result of v1/v2 back into a denormal
        // (so just let the compiler convert it).
        // If that de-normalization were possible,
        // this could have no conversion latency at all.
        val3[i] = v1/v2; // vdivpd when vectorized, with only 1 conversion
    }
}
int main()
{
    uint32_t a[1024],b[1024],c[1024];
    for(int i=0;i<1024;i++)
    {
        a[i]=1+i*i; b[i]=1+i;
    }
    uint64_t a64[1024],b64[1024],c64[1024];
    for(int i=0;i<1024;i++)
    {
        a64[i]=1+i*i; b64[i]=1+i;
    }
    std::cout<<"emulation:"<<std::endl;
    auto t1 = readTSC();
    intEmulationTest(a,b,c);
    auto t2 = readTSC();
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;
    std::cout<<"magic:"<<std::endl;
    auto t3 = readTSC();
    intMagicTest(a,b,c);
    auto t4 = readTSC();
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;
    std::cout<<"int:"<<std::endl;
    auto t5 = readTSC();
    intTest(a,b,c);
    auto t6 = readTSC();
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;
    std::cout<<"magic double:"<<std::endl;
    auto t7 = readTSC();
    intMagicTestDouble(a64,b64,c64);
    auto t8 = readTSC();
    for(int i=0;i<10;i++)
        std::cout<<c64[i]<<" "<<std::endl;
    std::cout<<"magic double tmp:"<<std::endl;
    auto t9 = readTSC();
    intMagicTestDoubleTmp(a,b,c);
    auto t10 = readTSC();
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;
    std::cout<<"emulation: "<<t2-t1<<" cycles"<<std::endl;
    std::cout<<"magic: "<<t4-t3<<" cycles"<<std::endl;
    std::cout<<"int: "<<t6-t5<<" cycles"<<std::endl;
    std::cout<<"magic double: "<<t8-t7<<" cycles"<<std::endl;
    std::cout<<"magic double tmp: "<<t10-t9<<" cycles"<<std::endl;
    return 0;
}
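The magic paths only work if the rounding mode really is truncation and denormals survive (no FTZ/DAZ). Below is a minimal standalone sketch of how one might check both preconditions with std::fesetround; the demonstration values and the check are my addition, not part of the benchmark:
#include <cfenv>
#include <cstdint>
#include <cstring>
#include <iostream>
int main()
{
    // Round-toward-zero keeps a quotient computed just below an integer
    // (e.g. 6.999...) from rounding up to 7.0f before the truncating
    // float->int conversion.
    std::fesetround(FE_TOWARDZERO);
    // The identity behind the trick: a 23-bit integer n reinterpreted as a
    // float is the denormal n * 2^-149, and the scale cancels in a division.
    uint32_t n = 20, m = 3;
    float fn, fm;
    std::memcpy(&fn, &n, sizeof fn); // fn == 20 * 2^-149 (denormal)
    std::memcpy(&fm, &m, sizeof fm); // fm ==  3 * 2^-149 (denormal)
    std::cout << (uint32_t)(fn / fm) << std::endl; // prints 6, i.e. 20/3
    // Under -ffast-math (FTZ/DAZ) the operands read as zero and the trick breaks.
    std::cout << (fn == 0.0f ? "denormals flushed" : "denormals ok") << std::endl;
    return 0;
}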
Output of the benchmark program on godbolt.org:
emulation: 7784 cycles <-- should be ok for positive values only; may need more corner-case checking
magic: 1708 cycles <-- not performance-portable (denormals), only 23 bits
int: 16576 cycles
magic double: 11844 cycles <-- not performance-portable
magic double tmp: 5432 cycles <-- not performance-portable
To make up for the conversion overhead and the bit-level hacking, the SIMD hardware would need to be much wider, like 8192 or 16384 bits; maybe only then would it become appealing for compilers to vectorize unknown-integer / unknown-integer division through FP SIMD (roughly 100 instructions to check every corner case, but running across 256 pipelines for perhaps a ~2.5x speedup).
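For reference, a hand-vectorized AVX2 version of the 23-bit trick would look roughly like the sketch below (same assumptions as intMagicTest: no FTZ/DAZ, operands and quotients within float precision; the function is hypothetical, not part of the benchmark above):
#include <immintrin.h>
#include <cstdint>
// Hypothetical hand-vectorized intMagicTest (AVX2). Same caveats as the
// scalar version: denormals must not be flushed (no -ffast-math), and the
// division should run in round-toward-zero mode so a quotient just below
// an integer can't round up past it.
void intMagicTestAVX2(const uint32_t* __restrict__ a,
                      const uint32_t* __restrict__ b,
                      uint32_t* __restrict__ c)
{
    for (int i = 0; i < 1024; i += 8)
    {
        // Reinterpret 8 integers as denormal floats: n becomes n * 2^-149.
        __m256 v1 = _mm256_castsi256_ps(_mm256_loadu_si256((const __m256i*)&a[i]));
        __m256 v2 = _mm256_castsi256_ps(_mm256_loadu_si256((const __m256i*)&b[i]));
        // The 2^-149 scales cancel: q holds a[i]/b[i] as a normal float.
        __m256 q = _mm256_div_ps(v1, v2);
        // vcvttps2dq truncates toward zero regardless of the MXCSR rounding mode.
        _mm256_storeu_si256((__m256i*)&c[i], _mm256_cvttps_epi32(q));
    }
}
Note that on the Intel CPUs below this would likely still lose badly: denormal operands tend to take a slow assist path on those cores, which matches the "not performance-portable" caveat above.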
GPU hardware has 32 pipelines per warp/wavefront and up to 192 pipelines per core, so maybe it is usable there, but it does not look like much of a gain for x86 CPUs even with AVX512 (at least for general-purpose use that requires full 32-bit precision). For very low-precision integer math, one can simply use floats everywhere instead (assuming the corner cases are not a problem).
CPU Type: AMD EPYC 7R32 (GCC v11)
emulation: 8260 cycles
magic: 1904 cycles
int: 15708 cycles (this was compiled with uint64_t)
magic double: 12544 cycles
magic double tmp: 6188 cycles
CPU Type: AMD FX(tm)-8150 Eight-Core Processor (GCC v10)
emulation: 20687 cycles
magic: 67583 cycles
int: 32914 cycles
int: 31135 cycles (this was compiled with uint64_t)
magic double: 615307 cycles
magic double tmp: 141889 cycles
CPU Type: Intel(R) Xeon(R) E-2286G CPU @ 4.00GHz
emulation: 9964 cycles
magic: 138052 cycles
int: 6477 cycles
int: 19016 cycles (this was compiled with uint64_t)
magic double: 141443 cycles
magic double tmp: 137180 cycles
CPU Type: Intel(R) Xeon(R) CPU E3-1270 V2 @ 3.50GHz
emulation: 18282 cycles
magic: 210486 cycles
int: 14436 cycles
int: 33604 cycles (this was compiled with uint64_t)
magic double: 225920 cycles
magic double tmp: 217520 cycles
CPU Type: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
emulation: 39483 cycles
magic: 153666 cycles
int: 33746 cycles (this was compiled with uint64_t)
magic double: 158076 cycles
magic double tmp: 159813 cycles
CPU Type: AMD Opteron(tm) Processor 4332 HE
emulation: 18633 cycles
magic: 114682 cycles
int: 16280 cycles
int: 31070 cycles (this was compiled with uint64_t)
magic double: 504295 cycles
magic double tmp: 104919 cycles
CPU Type: Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
emulation: 3448 cycles <-- AVX512
magic: 13296 cycles
int: 7676 cycles
int: 84110 cycles (this was compiled with uint64_t)
magic double: 178162 cycles
magic double tmp: 27662 cycles