When I run an FMA-optimized Horner-scheme polynomial evaluation (a cosine approximation), it produces an average error of 0.161 ulps on an FX8150 but 0.154 ulps on a godbolt.org server, even though -ffast-math is not enabled (GCC).
If this is caused by hardware, and accuracy differs per machine, what does a C++ compiler do to maintain floating-point accuracy across different machines?
Is there only a minimum accuracy requirement in the language specification, so that any CPU vendor is free to boost precision as high as they want?
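For context, here is a minimal sketch (not taken from my benchmark; the inputs are chosen only to expose the effect) of how a single fused multiply-add rounds differently from a separate multiply followed by an add. I suspect this is the kind of difference that shows up between machines:

#include <cmath>
#include <cstdio>

int main()
{
    // 1 + 2^-12 and 1 + 2^-11 are exactly representable in float
    float a = 1.0f + 1.0f/4096.0f;
    float c = -(1.0f + 1.0f/2048.0f);

    // unfused: a*a rounds to 1 + 2^-11 first, then the add cancels it to 0
    // (build with -ffp-contract=off so the compiler cannot fuse this expression itself)
    float unfused = a*a + c;

    // fused: the exact product keeps its low bit, so 2^-24 survives the addition
    float fused = std::fma(a, a, c);

    std::printf("unfused = %g, fused = %g\n", unfused, fused);
    return 0;
}

With these values I expect the unfused expression to print 0 and the fused one about 5.96e-08 (2^-24), because the fused form rounds only once, after the addition.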
Minimal reproducible sample:
#include<iostream>

// only optimized for [-1,1] input range
template<typename Type, int Simd>
inline
void cosFast(Type * const __restrict__ data, Type * const __restrict__ result) noexcept
{
    alignas(64)
    Type xSqr[Simd];

    for(int i=0;i<Simd;i++)
    {
        xSqr[i] = data[i]*data[i];
    }

    for(int i=0;i<Simd;i++)
    {
        result[i] = Type(2.425144155360214881511638e-05);
    }

    for(int i=0;i<Simd;i++)
    {
        result[i] = result[i]*xSqr[i] + Type(-0.001388599083010255696990498);
    }

    for(int i=0;i<Simd;i++)
    {
        result[i] = result[i]*xSqr[i] + Type(0.04166657759826541962411284);
    }

    for(int i=0;i<Simd;i++)
    {
        result[i] = result[i]*xSqr[i] + Type(-0.4999999436679569697616898);
    }

    for(int i=0;i<Simd;i++)
    {
        result[i] = result[i]*xSqr[i] + Type(0.9999999821855363180134191);
    }
}
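
// Not part of the original benchmark: a hypothetical scalar variant that forces
// the fused operations with std::fma, so the rounding no longer depends on
// whether the compiler chooses to contract result*xSqr + coeff on its own
// (i.e. on -ffp-contract / -mfma).
#include<cmath>
template<typename Type>
inline Type cosFastScalar(Type x) noexcept
{
    const Type xSqr = x*x;
    Type r = Type(2.425144155360214881511638e-05);
    r = std::fma(r, xSqr, Type(-0.001388599083010255696990498));
    r = std::fma(r, xSqr, Type(0.04166657759826541962411284));
    r = std::fma(r, xSqr, Type(-0.4999999436679569697616898));
    r = std::fma(r, xSqr, Type(0.9999999821855363180134191));
    return r;
}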
#include<cstring>
#include<cstdint> // for uint32_t

template<typename T>
uint32_t GetUlpDifference(T a, T b)
{
    // reinterpret the bits of both values (assumes a 32-bit floating-point type)
    uint32_t aBitValue;
    uint32_t bBitValue;
    std::memcpy(&aBitValue,&a,sizeof(T));
    std::memcpy(&bBitValue,&b,sizeof(T));

    return (aBitValue > bBitValue) ?
           (aBitValue - bBitValue) :
           (bBitValue - aBitValue);
}
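
// Not part of the original question: the raw bit difference above is only a valid
// ULP distance when a and b have the same sign (which holds here, since cos is
// positive on [-1,1]). A hypothetical sign-aware variant, remapping the bit
// patterns onto a monotonic integer scale first:
template<typename T>
uint32_t GetUlpDistanceSigned(T a, T b)
{
    static_assert(sizeof(T) == sizeof(uint32_t), "sketch assumes a 32-bit float type");
    uint32_t ua, ub;
    std::memcpy(&ua,&a,sizeof(T));
    std::memcpy(&ub,&b,sizeof(T));
    // negative floats are sign-magnitude; fold them below the positives
    const uint32_t ma = (ua & 0x80000000u) ? (0x80000000u - (ua & 0x7fffffffu)) : (ua + 0x80000000u);
    const uint32_t mb = (ub & 0x80000000u) ? (0x80000000u - (ub & 0x7fffffffu)) : (ub + 0x80000000u);
    return (ma > mb) ? (ma - mb) : (mb - ma);
}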
#include<vector>

template<typename Type>
float computeULP(std::vector<Type> real, std::vector<Type> approximation)
{
    int ctr = 0;
    Type diffSum = 0;

    for(auto r:real)
    {
        Type diff = GetUlpDifference(r,approximation[ctr++]);
        diffSum += diff;
    }

    return diffSum/ctr;
}
template<typename Type>
float computeMaxULP(std::vector<Type> real, std::vector<Type> approximation)
{
    int ctr = 0;
    Type mx = 0;
    int index = -1;
    Type rr = 0;
    Type aa = 0;

    for(auto r:real)
    {
        Type diff = GetUlpDifference(r,approximation[ctr++]);
        if(mx<diff)
        {
            mx = diff;
            rr = r;
            aa = approximation[ctr-1];
            index = ctr-1;
        }
    }

    std::cout<<"("<<index<<":"<<rr<<"<-->"<<aa<<")";
    return mx;
}
#include<cmath>

void test()
{
    constexpr int n = 8192*64;
    std::vector<float> a(n),b(n),c(n);

    for(int i=0;i<n;i++)
        a[i] = (i-(n/2))/(float)(n/2);

    // approximation
    for(int i=0;i<n;i+=16)
        cosFast<float,16>(a.data()+i,b.data()+i);

    // exact
    for(int i=0;i<n;i++)
        c[i] = std::cos(a[i]);

    std::cout<<"avg. ulps: "<<computeULP(b,c)<<std::endl;
    std::cout<<"max. ulps: "<<computeMaxULP(b,c)<<std::endl;
}
int main()
{
    test();
    return 0;
}
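One thing I am not sure about: std::cos itself is not required to be correctly rounded and can differ between libm builds, so part of the measured ULP difference may come from the reference rather than from the FMA codegen. A sketch of what I mean (it would replace the "exact" loop in test() above, names unchanged), computing the reference in double and rounding it once to float:

    // double-precision reference, rounded once to float
    for(int i=0;i<n;i++)
        c[i] = static_cast<float>(std::cos(static_cast<double>(a[i])));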
Proof that it uses FMA:
https://godbolt.org/z/Y4qYMoxcn
.L23:
vmovups ymm3, YMMWORD PTR [r12+rax]
vmovups ymm2, YMMWORD PTR [r12+32+rax]
vmulps ymm3, ymm3, ymm3
vmulps ymm2, ymm2, ymm2
vmovaps ymm1, ymm3
vmovaps ymm0, ymm2
vfmadd132ps ymm1, ymm7, ymm8
vfmadd132ps ymm0, ymm7, ymm8
vfmadd132ps ymm1, ymm6, ymm3
vfmadd132ps ymm0, ymm6, ymm2
vfmadd132ps ymm1, ymm5, ymm3
vfmadd132ps ymm0, ymm5, ymm2
vfmadd132ps ymm1, ymm4, ymm3
vfmadd132ps ymm0, ymm4, ymm2
vmovups YMMWORD PTR [r13+0+rax], ymm1
vmovups YMMWORD PTR [r13+32+rax], ymm0
add rax, 64
cmp rax, 2097152
jne .L23
This godbolt instance (I don't know whether it was a Xeon or an EPYC) further improved the result to 0.152 ulps average.
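In case it matters for reproducing the numbers, here is a small sketch to check whether a given build actually targets hardware FMA. My assumptions: GCC/Clang predefine __FMA__ when the FMA instruction set is enabled for the target, and <cmath> may define FP_FAST_FMAF when a fused single-precision fma is fast:

#include <cmath>
#include <iostream>

int main()
{
#ifdef __FMA__
    std::cout << "built with FMA target support (-mfma or an -march implying it)\n";
#else
    std::cout << "no FMA target support at compile time\n";
#endif
#ifdef FP_FAST_FMAF
    std::cout << "FP_FAST_FMAF defined: std::fma on float is expected to be a single instruction\n";
#endif
    return 0;
}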