Sample cosine approximation:
- 0.15 ULPS average (1 ULPS max) error
- 12x speedup for AVX512, 8-9x for AVX2
approximation of cosine over [-1,1] range.
Polynomial coefficients were found by a genetic algorithm. The multiplication series look like Chebyshev Polynomials but they are not, but needed when you need a wider range of input rather than just [-1,1].
// only optimized for [-1,1] input range!!
template<typename Type, int Simd>
inline
void cosFast(
Type * const __restrict__ data,
Type * const __restrict__ result) noexcept
{
alignas(64)
Type xSqr[Simd];
alignas(64)
Type xSqrSqr[Simd];
alignas(64)
Type xSqrSqrSqr[Simd];
alignas(64)
Type xSqrSqrSqrSqr[Simd];
#pragma GCC ivdep
for(int i=0;i<Simd;i++)
{
xSqr[i] = data[i]*data[i];
}
#pragma GCC ivdep
for(int i=0;i<Simd;i++)
{
xSqrSqr[i] = xSqr[i]*xSqr[i];
}
#pragma GCC ivdep
for(int i=0;i<Simd;i++)
{
xSqrSqrSqr[i] = xSqrSqr[i]*xSqr[i];
}
#pragma GCC ivdep
for(int i=0;i<Simd;i++)
{
xSqrSqrSqrSqr[i] = xSqrSqr[i]*xSqrSqr[i];
}
#pragma GCC ivdep
for(int i=0;i<Simd;i++)
{
result[i] = Type(2.37711074060342753000441e-05)*xSqrSqrSqrSqr[i] +
Type(-0.001387712893937020908197155)*xSqrSqrSqr[i] +
Type(0.04166611039514833692010143)*xSqrSqr[i] +
Type(-0.4999998698566363586337502)*xSqr[i] +
Type(0.9999999941252593060880827);
}
}
If you need to use a wider range, then you should do a range-reduction at high precision and use something like this:
- range reduce to -pi,pi at high precision
- divide by "4" to put it in -1,1 range
- compute same series as above => tmp
- compute (Chebyshev) L_"4" (tmp) = result
.L29:
vmovups ymm7, YMMWORD PTR [r14+rax]
vmulps ymm1, ymm7, ymm7
vmovups ymm7, YMMWORD PTR [r14+32+rax]
vmulps ymm3, ymm1, ymm1
vmulps ymm6, ymm3, ymm3
vmulps ymm6, ymm6, YMMWORD PTR .LC11[rip]
vmulps ymm0, ymm7, ymm7
vmulps ymm5, ymm3, ymm1
vfmadd132ps ymm5, ymm6, YMMWORD PTR .LC12[rip]
vmulps ymm2, ymm0, ymm0
vmulps ymm6, ymm2, ymm2
vmulps ymm6, ymm6, YMMWORD PTR .LC11[rip]
vfmadd132ps ymm3, ymm5, YMMWORD PTR .LC13[rip]
vmulps ymm4, ymm2, ymm0
vfmadd132ps ymm4, ymm6, YMMWORD PTR .LC12[rip]
vfmadd132ps ymm1, ymm3, YMMWORD PTR .LC14[rip]
vfmadd132ps ymm2, ymm4, YMMWORD PTR .LC13[rip]
vaddps ymm1, ymm1, YMMWORD PTR .LC15[rip]
vfmadd132ps ymm0, ymm2, YMMWORD PTR .LC14[rip]
vaddps ymm0, ymm0, YMMWORD PTR .LC15[rip]
vmovups YMMWORD PTR [r15+rax], ymm1
vmovups YMMWORD PTR [r15+32+rax], ymm0
add rax, 64
cmp rax, 16777216
jne .L29