I'm sure the other answers are better than mine - running a very quick and dirty polynomial approximation, I end up with these.
inline __m512d exp2(const __m512d x) {
const __m512d a = _mm512_set1_pd(0.000217549227054);
const __m512d b = _mm512_set1_pd(0.00124218531444);
const __m512d c = _mm512_set1_pd(0.00968102455999);
const __m512d d = _mm512_set1_pd(0.0554821818101);
const __m512d e = _mm512_set1_pd(0.240230073528);
const __m512d f = _mm512_set1_pd(0.693146979806);
const __m512d g = _mm512_set1_pd(1.0);
const __m512d fx = _mm512_floor_pd(x); // integer part
const __m512d X = _mm512_sub_pd(x, fx); // fractional part
__m512d y = _mm512_fmadd_pd(a, X, b);
y = _mm512_fmadd_pd(y, X, c);
y = _mm512_fmadd_pd(y, X, d);
y = _mm512_fmadd_pd(y, X, e);
y = _mm512_fmadd_pd(y, X, f);
y = _mm512_fmadd_pd(y, X, g); // polynomial approximation over [0,1)
return _mm512_scalef_pd(y, fx); // scale by 2^integer
}
inline __m512d exp(const __m512d x) {
return exp2(_mm512_mul_pd(x, _mm512_set1_pd(1.442695040888963387)));
}
inline __m512d log2(const __m512d x) {
const __m512d m = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
const __m512d a = _mm512_set1_pd(0.0146498917256);
const __m512d b = _mm512_set1_pd(-0.178725976271);
const __m512d c = _mm512_set1_pd(0.953841083567);
const __m512d d = _mm512_set1_pd(-2.92298892586);
const __m512d e = _mm512_set1_pd(5.68725545823);
const __m512d f = _mm512_set1_pd(-7.4092580291);
const __m512d g = _mm512_set1_pd(7.09194627711);
const __m512d h = _mm512_set1_pd(-3.23671917705);
__m512d y = _mm512_fmadd_pd(a, m, b);
y = _mm512_fmadd_pd(y, m, c);
y = _mm512_fmadd_pd(y, m, d);
y = _mm512_fmadd_pd(y, m, e);
y = _mm512_fmadd_pd(y, m, f);
y = _mm512_fmadd_pd(y, m, g);
y = _mm512_fmadd_pd(y, m, h); // poly approximation over [1,2) mantissa
return _mm512_add_pd(y, _mm512_getexp_pd(x));
}
inline __m512d log(const __m512d x) {
return _mm512_mul_pd(log2(x), _mm512_set1_pd(0.693147180559945286));
}
Out-of-order execution across independent exp2()
or log2()
operations can handle the FMA dependency chains from using Horner's rule for the 6th-order polynomials.
See also Agner Fog's VCL implementations which aim for high precision, close to full precision of double
:
double-precision exp_d
template, supporting a few bases including 2.0
, and exp(x-1)
vs. exp(x)
. (See the exp2
caller for the right template params).
Uses a 13th-order Taylor series, which comments in the code indicate is faster than the alternate version, using a Pade expansion: a ratio of two polynomials. One division per many FMAs isn't a disaster for throughput, especially if you have lots surrounding code that also does some non-division work with each result, but doing just this might be too much division per FMA.
double-precision log_d
template. This does use a ratio of 5th-order polynomials for the mantissa. Template params support log(x)
vs. log(x+1)
to avoid losing precision. It only does natural log (base e
), so you'll need to scale the result by 1/ln(2)
.