You can try this log(x) replacement I wrote using SSE2 intrinsics:
#include <assert.h>
#include <immintrin.h>
static __m128i EXPONENT_MASK;
static __m128i EXPONENT_BIAS;
static __m128i EXPONENT_ZERO;
static __m128d FIXED_SCALE;
static __m128d LOG2ERECIP;
static const int EXPONENT_SHIFT = 52;
// Required to initialize constants.
void sselog_init() {
EXPONENT_MASK = _mm_set1_epi64x(0x7ff0000000000000UL);
EXPONENT_BIAS = _mm_set1_epi64x(0x00000000000003ffUL);
EXPONENT_ZERO = _mm_set1_epi64x(0x3ff0000000000000UL);
FIXED_SCALE = _mm_set1_pd(9.31322574615478515625e-10); // 2^-30
LOG2ERECIP = _mm_set1_pd(0.693147180559945309417232121459); // 1/log2(e)
}
// Extract IEEE754 double exponent as integer.
static inline __m128i extractExponent(__m128d x) {
return
_mm_sub_epi64(
_mm_srli_epi64(
_mm_and_si128(_mm_castpd_si128(x), EXPONENT_MASK),
EXPONENT_SHIFT),
EXPONENT_BIAS);
}
// Set IEEE754 double exponent to zero.
static inline __m128d clearExponent(__m128d x) {
return
_mm_castsi128_pd(
_mm_or_si128(
_mm_andnot_si128(
EXPONENT_MASK,
_mm_castpd_si128(x)),
EXPONENT_ZERO));
}
// Compute log(x) using SSE2 intrinsics to >= 30 bit precision, except denorms.
double sselog(double x) {
assert(x >= 2.22507385850720138309023271733e-308); // no denormalized
// Two independent logarithms could be computed by initializing
// base with two different values, either with independent
// arguments to _mm_set_pd() or from contiguous memory with
// _mm_load_pd(). No other changes should be needed other than to
// extract both results at the end of the function (or just return
// the packed __m128d).
__m128d base = _mm_set_pd(x, x);
__m128i iLog = extractExponent(base);
__m128i fLog = _mm_setzero_si128();
base = clearExponent(base);
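// Each block of 10 squarings raises the mantissa (in [1,2)) to the 1024th
// power, so its exponent ends up holding the next 10 fractional bits of log2.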
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
// fLog = _mm_slli_epi64(fLog, 10); // Not needed first time through.
fLog = _mm_or_si128(extractExponent(base), fLog);
base = clearExponent(base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
fLog = _mm_slli_epi64(fLog, 10);
fLog = _mm_or_si128(extractExponent(base), fLog);
base = clearExponent(base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
fLog = _mm_slli_epi64(fLog, 10);
fLog = _mm_or_si128(extractExponent(base), fLog);
// No _mm_cvtepi64_pd() exists so use _mm_cvtepi32_pd() conversion.
iLog = _mm_shuffle_epi32(iLog, 0x8);
fLog = _mm_shuffle_epi32(fLog, 0x8);
__m128d result = _mm_mul_pd(_mm_cvtepi32_pd(fLog), FIXED_SCALE);
result = _mm_add_pd(result, _mm_cvtepi32_pd(iLog));
// Convert from base 2 logarithm and extract result.
result = _mm_mul_pd(result, LOG2ERECIP);
return ((double *)&result)[0]; // other output in ((double *)&result)[1]
}
The code implements the algorithm described in this Texas Instruments brief: repeatedly square the argument and concatenate the exponent bits. It will not work with denormalized inputs. It provides at least 30 bits of precision.
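In case it helps to see the scheme without intrinsics, here is a minimal scalar sketch of the same idea (my own illustration, not the code above; scalar_log is a hypothetical name):
#include <math.h>
/* The mantissa m is kept in [1, 2); each block of 10 squarings makes
   floor(log2(m^1024)) = floor(1024*log2(m)), i.e. the next 10 fractional
   bits of log2 show up in the exponent, which frexp() reads back out. */
double scalar_log(double x) {
    int e;
    double m = 2.0*frexp(x, &e);           /* x = m * 2^(e-1), m in [1, 2) */
    long iLog = e - 1;                     /* integer part of log2(x) */
    long fLog = 0;
    for (int block = 0; block < 3; ++block) {
        for (int i = 0; i < 10; ++i)
            m *= m;                        /* m is now the old m^1024 */
        int e2;
        m = 2.0*frexp(m, &e2);             /* renormalize back into [1, 2) */
        fLog = (fLog << 10) | (e2 - 1);    /* append the next 10 bits */
    }
    /* fLog is a 30-bit fixed-point fraction (scale 2^-30); convert log2 to ln */
    return (iLog + fLog*9.31322574615478515625e-10)*0.693147180559945309417;
}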
The SSE2 version runs faster than log() on one of my machines and slower on the other, so your mileage may vary; I don't claim this is necessarily the best approach. The sselog() code, however, actually computes two logarithms in parallel using both halves of a 128-bit SSE2 word (although the function as-is only returns one result), so it could be adapted into one building block of a SIMD calculation of your entire function (and I think log is the difficult part as cos is pretty well-behaved). Furthermore, your processor supports AVX, which can pack 4 double-precision elements into a 256-bit word, and extending this code to AVX should be straightforward.
If you choose not to go full SIMD, you could still use both logarithm slots by pipelining - i.e. compute log(w2*cos(w1)/(M_PI_2-w1)) for the current iteration with log(u2) for the next iteration.
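As a rough sketch of that pipelining idea (assuming a hypothetical sselog2() that is the function above modified, per the comment in its body, to take the packed __m128d and return both results; mt_drand() is the Mersenne Twister generator from mtwist.h used below):
#include <math.h>
#include <immintrin.h>
__m128d sselog2(__m128d x);
double mt_drand(void);
double pipelined(long n) {
    double sum = 0.0;
    double u1 = mt_drand();
    double w2 = -log(mt_drand());          /* prime the pipeline with log(u2) */
    for (long i = 0; i < n; ++i) {
        double w1 = M_PI*(u1 - 0.5);
        /* slot 0: the log needed now; slot 1: log(u2) for the next iteration */
        __m128d args = _mm_set_pd(mt_drand(), w2*cos(w1)/(M_PI_2 - w1));
        __m128d logs = sselog2(args);
        sum += tan(w1)*(M_PI_2 - w1) + ((double *)&logs)[0];
        w2 = -((double *)&logs)[1];        /* becomes the next iteration's w2 */
        u1 = mt_drand();
    }
    return sum;
}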
Even if sselog() benchmarks slower against log in isolation, it may still be worth testing with your actual function. This code does not stress the data cache at all so it may be friendlier with other code that does (e.g. with a cos that uses lookup tables). Also microinstruction dispatch could be improved (or not) with surrounding code, depending on whether other code uses SSE.
My other advice (repeated from the comments) would be:
- Try -march=native -mtune=native to get gcc to optimize for your CPU.
- Avoid calling both tan and cos on the same argument - use sincos or a trig identity (see the sketch after this list).
- Consider using a GPU (e.g. OpenCL).
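For the second point, a minimal sketch using glibc's sincos() (a GNU extension, so _GNU_SOURCE is needed; the term computed matches the formula in your original program):
/* One sincos() call replaces separate tan() and cos() of the same argument. */
#define _GNU_SOURCE
#include <math.h>
double term(double w1, double w2) {
    double s, c;
    sincos(w1, &s, &c);                    /* sin(w1) and cos(w1) in one call */
    return (s/c)*(M_PI_2 - w1)             /* tan(w1) = sin(w1)/cos(w1) */
         + log(w2*c/(M_PI_2 - w1));
}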
It seems like it's better to compute sin instead of cos - the reason is that you can use it for tan_w1 = sin_w1/sqrt(1.0 - sin_w1*sin_w1). Using cos, which I originally suggested, loses the proper sign when computing tan. And it looks like you can get a good speedup by using a minimax polynomial over [-pi/2, pi/2] as other answerers have said. The 7-term function at this link (make sure you get minimaxsin, not taylorsin) seems to work quite well.
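As a small scalar illustration of that identity (a sketch; it relies on w1 being in (-pi/2, pi/2) as in the original program, so cos(w1) >= 0 and tan keeps the sign of sin):
#include <math.h>
/* tan(w1) from sin(w1) alone; the non-negative sqrt branch is correct here. */
double tan_from_sin(double w1) {
    double sin_w1 = sin(w1);               /* or a minimax approximation of sin */
    double cos_w1 = sqrt(1.0 - sin_w1*sin_w1);
    return sin_w1/cos_w1;
}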
So here's your program rewritten with all SSE2 transcendental approximations:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include "mtwist.h"
#if defined(__AVX__)
#define VECLEN 4
#elif defined(__SSE2__)
#define VECLEN 2
#else
#error // No SIMD available.
#endif
#if VECLEN == 4
#define VBROADCAST(K) { K, K, K, K }
typedef double vdouble __attribute__((vector_size(32)));
typedef long vlong __attribute__((vector_size(32)));
#elif VECLEN == 2
#define VBROADCAST(K) { K, K }
typedef double vdouble __attribute__((vector_size(16)));
typedef long vlong __attribute__((vector_size(16)));
#endif
static const vdouble FALLBACK_THRESHOLD = VBROADCAST(1.0 - 0.001);
vdouble sse_sin(vdouble x) {
static const vdouble a0 = VBROADCAST(1.0);
static const vdouble a1 = VBROADCAST(-1.666666666640169148537065260055e-1);
static const vdouble a2 = VBROADCAST( 8.333333316490113523036717102793e-3);
static const vdouble a3 = VBROADCAST(-1.984126600659171392655484413285e-4);
static const vdouble a4 = VBROADCAST( 2.755690114917374804474016589137e-6);
static const vdouble a5 = VBROADCAST(-2.502845227292692953118686710787e-8);
static const vdouble a6 = VBROADCAST( 1.538730635926417598443354215485e-10);
vdouble xx = x*x;
return x*(a0 + xx*(a1 + xx*(a2 + xx*(a3 + xx*(a4 + xx*(a5 + xx*a6))))));
}
static inline vlong shiftRight(vlong x, int bits) {
#if VECLEN == 4
__m128i lo = (__m128i)_mm256_extractf128_si256((__m256i)x, 0);
__m128i hi = (__m128i)_mm256_extractf128_si256((__m256i)x, 1);
return (vlong)
_mm256_insertf128_si256(
_mm256_castsi128_si256(_mm_srli_epi64(lo, bits)),
_mm_srli_epi64(hi, bits),
1);
#elif VECLEN == 2
return (vlong)_mm_srli_epi64((__m128i)x, bits);
#endif
}
static inline vlong shiftLeft(vlong x, int bits) {
#if VECLEN == 4
__m128i lo = (__m128i)_mm256_extractf128_si256((__m256i)x, 0);
__m128i hi = (__m128i)_mm256_extractf128_si256((__m256i)x, 1);
return (vlong)
_mm256_insertf128_si256(
_mm256_castsi128_si256(_mm_slli_epi64(lo, bits)),
_mm_slli_epi64(hi, bits),
1);
#elif VECLEN == 2
return (vlong)_mm_slli_epi64((__m128i)x, bits);
#endif
}
static const vlong EXPONENT_MASK = VBROADCAST(0x7ff0000000000000L);
static const vlong EXPONENT_BIAS = VBROADCAST(0x00000000000003ffL);
static const vlong EXPONENT_ZERO = VBROADCAST(0x3ff0000000000000L);
static const vdouble FIXED_SCALE = VBROADCAST(9.31322574615478515625e-10); // 2^-30
static const vdouble LOG2ERECIP = VBROADCAST(0.6931471805599453094172);
static const int EXPONENT_SHIFT = 52;
// Extract IEEE754 double exponent as integer.
static inline vlong extractExponent(vdouble x) {
return shiftRight((vlong)x & EXPONENT_MASK, EXPONENT_SHIFT) - EXPONENT_BIAS;
}
// Set IEEE754 double exponent to zero.
static inline vdouble clearExponent(vdouble x) {
return (vdouble)(((vlong)x & ~EXPONENT_MASK) | EXPONENT_ZERO);
}
// Compute log(x) using SIMD to >= 30 bit precision, except denorms.
vdouble sse_log(vdouble base) {
vlong iLog = extractExponent(base);
vlong fLog = VBROADCAST(0);
base = clearExponent(base);
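// As in sselog() above: each block of 10 squarings moves the next 10
// fractional bits of log2 into the exponent field.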
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
fLog = shiftLeft(fLog, 10);
fLog |= extractExponent(base);
base = clearExponent(base);
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
fLog = shiftLeft(fLog, 10);
fLog |= extractExponent(base);
base = clearExponent(base);
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
fLog = shiftLeft(fLog, 10);
fLog |= extractExponent(base);
// No _mm_cvtepi64_pd() exists so use 32-bit conversion to double.
#if VECLEN == 4
__m128i iLogLo = _mm256_extractf128_si256((__m256i)iLog, 0);
__m128i iLogHi = _mm256_extractf128_si256((__m256i)iLog, 1);
iLogLo = _mm_srli_si128(_mm_shuffle_epi32(iLogLo, 0x80), 8);
iLogHi = _mm_slli_si128(_mm_shuffle_epi32(iLogHi, 0x08), 8);
__m128i fLogLo = _mm256_extractf128_si256((__m256i)fLog, 0);
__m128i fLogHi = _mm256_extractf128_si256((__m256i)fLog, 1);
fLogLo = _mm_srli_si128(_mm_shuffle_epi32(fLogLo, 0x80), 8);
fLogHi = _mm_slli_si128(_mm_shuffle_epi32(fLogHi, 0x08), 8);
vdouble result = _mm256_cvtepi32_pd(iLogHi | iLogLo) +
FIXED_SCALE*_mm256_cvtepi32_pd(fLogHi | fLogLo);
#elif VECLEN == 2
iLog = (vlong)_mm_shuffle_epi32((__m128i)iLog, 0x8);
fLog = (vlong)_mm_shuffle_epi32((__m128i)fLog, 0x8);
vdouble result = _mm_cvtepi32_pd((__m128i)iLog) +
FIXED_SCALE*_mm_cvtepi32_pd((__m128i)fLog);
#endif
// Convert from base 2 logarithm and extract result.
return LOG2ERECIP*result;
}
// Original computation.
double fallback(double u1, double u2) {
double w1 = M_PI*(u1-1/2.0);
double w2 = -log(u2);
return tan(w1)*(M_PI_2-w1)+log(w2*cos(w1)/(M_PI_2-w1));
}
int main() {
static const vdouble ZERO = VBROADCAST(0.0);
static const vdouble ONE = VBROADCAST(1.0);
static const vdouble ONE_HALF = VBROADCAST(0.5);
static const vdouble PI = VBROADCAST(M_PI);
static const vdouble PI_2 = VBROADCAST(M_PI_2);
int i,j;
vdouble x = ZERO;
for(i = 0; i < 100000000; i += VECLEN) {
vdouble u1, u2;
for (j = 0; j < VECLEN; ++j) {
((double *)&u1)[j] = mt_drand();
((double *)&u2)[j] = mt_drand();
}
vdouble w1 = PI*(u1 - ONE_HALF);
vdouble w2 = -sse_log(u2);
vdouble sin_w1 = sse_sin(w1);
vdouble sin2_w1 = sin_w1*sin_w1;
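// Near +/-pi/2 the polynomial error can push sin(w1) slightly outside [-1, 1];
// fall back to the libm path there (see the note after the code).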
#if VECLEN == 4
int nearOne = _mm256_movemask_pd(sin2_w1 >= FALLBACK_THRESHOLD);
#elif VECLEN == 2
int nearOne = _mm_movemask_pd(sin2_w1 >= FALLBACK_THRESHOLD);
#endif
if (!nearOne) {
#if VECLEN == 4
vdouble cos_w1 = _mm256_sqrt_pd(ONE - sin2_w1);
#elif VECLEN == 2
vdouble cos_w1 = _mm_sqrt_pd(ONE - sin2_w1);
#endif
vdouble tan_w1 = sin_w1/cos_w1;
x += tan_w1*(PI_2 - w1) + sse_log(w2*cos_w1/(PI_2 - w1));
}
else {
vdouble result;
for (j = 0; j < VECLEN; ++j)
((double *)&result)[j] = fallback(((double *)&u1)[j], ((double *)&u2)[j]);
x += result;
}
}
double sum = 0.0;
for (i = 0; i < VECLEN; ++i)
sum += ((double *)&x)[i];
printf("%lf\n", sum);
return 0;
}
I ran into one annoying problem - the sin approximation error near ±pi/2 can put the value slightly outside [-1, 1] and that (1) causes the computation of tan to be invalid and (2) causes outsized effects when the log argument is near 0. To avoid this the code tests if sin(w1)^2 is "close" to 1 and if it is then it falls back to the original full double precision path. The definition of "close" is in FALLBACK_THRESHOLD at the top of the program - I arbitrarily set it to 0.999 which still appears to return values in the range of the OP's original program but has little effect on performance.
I've edited the code to use gcc-specific syntax extensions for SIMD. If your compiler doesn't have these extensions then you can go back through the edit history. The code now uses AVX if enabled in the compiler to process 4 doubles at a time (instead of 2 doubles with SSE2).
The results on my machine without calling mt_seed() (so the default seed makes the runs repeatable) are:
Version Time Result
original 14.653 secs -1917488837.945067
SSE 7.380 secs -1917488837.396841
AVX 6.271 secs -1917488837.422882
It makes sense that the SSE/AVX results are different from the original result because of the transcendental approximations. I think you should be able to tweak FALLBACK_THRESHOLD to trade off precision and speed. I'm not sure why the SSE and AVX results are slightly different from each other.