5

Simple implementations of acosf() can easily achieve an error bound of 1.5 ulp with respect to the infinitely precise (mathematical) result, if the platform has support for fused multiply-add (FMA). This means results never differ by more than one ulp from the correctly rounded result in round-to-nearest-or-even mode.

However, such an implementation usually comprises two major code branches that split the primary approximation interval [0,1] roughly in half, as in the exemplary code below. This branchiness inhibits automatic vectorization by compilers when targeting SIMD architectures.

Is there an alternative algorithmic approach that lends itself more readily to automatic vectorization, while maintaining the same error bound of 1.5 ulps? Platform support for FMA can be assumed.

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
/* maximum error = 1.45667 ulp
   Branchy reference implementation. Reduces to r = -|a| first, so both
   approximation branches see a non-positive argument; pi and pi/2 are
   represented as products of two floats so the final fmaf() gains extra
   effective precision over a single rounded constant. */
float my_acosf (float a)
{
    float r;

    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        /* 0x1.ddcb02p-1f * 0x1.aee9d6p+0f ~= pi/2, fused to reduce rounding error */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        /* 0x1.ddcb02p+0f * 0x1.aee9d6p+0f ~= pi, fused for extra precision */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
njuffa
  • 23,970
  • 4
  • 78
  • 130

3 Answers

4

The closest I have come to a satisfactory solution is based on an idea from a news posting by Robert Harley, in which he observed that for x in [0,1], acos(x) ≈ √(2*(1-x)), and that a polynomial can provide the scale factor necessary to make this an accurate approximation across the entire interval. As can be seen in the code below, this approach results in straight-line code with just two uses of the ternary operator to handle arguments in the negative half-plane.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define VECTORIZABLE 1
#define ARR_LEN      (1 << 24)
#define MAX_ULP      1 /* deviation from correctly rounded result */

#if VECTORIZABLE  
/* 
 Compute arccos(a) with a maximum error of 1.496766 ulp 
 This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
 https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

#else // VECTORIZABLE

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080
   asin is odd, so the approximation has the form a + a**3 * P(a**2);
   the minimax polynomial P is evaluated in Horner form using FMAs. */
float asinf_core(float a)
{
    float r, s;
    s = a * a;                       // s = a**2; P is a polynomial in s
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);              // fold in leading term: a + a*s*P(s)
    return r;
}

/* maximum error = 1.45667 ulp */
/* maximum error = 1.45667 ulp
   Branchy reference version (duplicated from the question for comparison).
   Reduces to r = -|a| so both branches see a non-positive argument; pi and
   pi/2 are each represented as a product of two floats so the fused
   multiply-add yields more effective precision than one rounded constant. */
float my_acosf (float a)
{
    float r;

    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        /* 0x1.ddcb02p-1f * 0x1.aee9d6p+0f ~= pi/2, fused */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        /* 0x1.ddcb02p+0f * 0x1.aee9d6p+0f ~= pi, fused */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
#endif // VECTORIZABLE

/* Exhaustive test: runs my_acosf() over all 2**32 float bit patterns in
   batches of ARR_LEN, comparing each result against a float-rounded
   double-precision acos() reference. Fails on any bitwise deviation
   greater than MAX_ULP. */
int main (void)
{
    double darg, dref;
    float ref, *a, *b;
    uint32_t argi, resi, refi;

    printf ("%svectorizable implementation of acos\n", 
            VECTORIZABLE ? "" : "non-");

    a = malloc (sizeof(a[0]) * ARR_LEN);
    b = malloc (sizeof(b[0]) * ARR_LEN);
    if ((a == NULL) || (b == NULL)) {       /* was unchecked: NULL deref risk */
        printf ("memory allocation failed\n");
        free (a);                           /* free(NULL) is a no-op */
        free (b);
        return EXIT_FAILURE;
    }

    argi = 0x00000000;
    do {
        /* fill batch with consecutive bit patterns; argi wraps after 2**32 */
        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }

        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }

        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            /* compare raw bit patterns; integer distance == ulp distance
               because acos results share one sign/binade ordering */
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                /* report the actual threshold instead of hardcoded "1 ulp" */
                printf ("error > %d ulp a[i]=% 14.6a  b[i]=% 14.6a  ref=% 14.6a  dref=% 21.13a\n", 
                        MAX_ULP, a[i], b[i], ref, dref);
                printf ("test FAILED\n");
                free (a);                   /* was leaked on this path */
                free (b);
                return EXIT_FAILURE;
            }
        }

        printf ("^^^^ argi = %08x\n", argi);
    } while (argi);

    printf ("test PASSED\n");

    free (a);
    free (b);

    return EXIT_SUCCESS;
}

Although the structure of this code appears conducive to auto-vectorization, I have not had much luck when targeting AVX2 with the compilers offered by Compiler Explorer. The only compiler that appears to be able to vectorize this code in the context of the inner loop of my test app above is Clang. But Clang seems to be able to do this only if I specify -ffast-math, which, however, has the undesired side-effect of turning the sqrtf() invocation into an approximate square root computed via rsqrt. I experimented with some less intrusive switches, e.g. -fno-honor-nans, -fno-math-errno, -fno-trapping-math, but my_acosf() did not vectorize even when I used those in combination.

For now I have resorted to manual translation of the above code into AVX2 + FMA intrinsics, like so:

#include "immintrin.h"

/* maximum error = 1.496766 ulp */
/* maximum error = 1.496766 ulp
   AVX2 + FMA translation of the scalar Harley-based my_acosf() above:
   t = 2*(1-|x|), s = sqrt(t), acos(|x|) = s + s*t*poly(t), then
   acos(x) = pi - acos(-x) is blended in for negative lanes. */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 zero= _mm256_set1_ps ( 0.0f);
    const __m256 two = _mm256_set1_ps ( 2.0f);
    const __m256 mtwo= _mm256_set1_ps (-2.0f);
    const __m256 c0  = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 c1  = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 c2  = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 c3  = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 c4  = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 c5  = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 c6  = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 c7  = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 c8  = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);  //  1.86637890e+0
    const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);  //  1.68325555e+0
    __m256 s, r, t, m;

    /* per-lane sign selection: t = fmadd(x, x<0 ? 2 : -2, 2) = 2*(1-|x|) */
    s = two;
    t = mtwo;
    m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
    t = _mm256_blendv_ps (t, s, m);
    t = _mm256_fmadd_ps (x, t, s);
    s = _mm256_sqrt_ps (t);
    /* Horner evaluation of the degree-8 correction polynomial in t */
    r = c0;
    r = _mm256_fmadd_ps (r, t, c1);
    r = _mm256_fmadd_ps (r, t, c2);
    r = _mm256_fmadd_ps (r, t, c3);
    r = _mm256_fmadd_ps (r, t, c4);
    r = _mm256_fmadd_ps (r, t, c5);
    r = _mm256_fmadd_ps (r, t, c6);
    r = _mm256_fmadd_ps (r, t, c7);
    r = _mm256_fmadd_ps (r, t, c8);
    r = _mm256_mul_ps (r, t);
    r = _mm256_fmadd_ps (r, s, s);          /* acos(|x|) */
    /* pi - r via fused pi0*pi1 product, blended into the negative lanes */
    t = _mm256_sub_ps (zero, r);
    t = _mm256_fmadd_ps (pi0, pi1, t);
    r = _mm256_blendv_ps (r, t, m);
    return r;
}
njuffa
  • 23,970
  • 4
  • 78
  • 130
  • 2
    Try with `-ffast-math -mtune=skylake`; clang is less aggressive about rewriting sqrt to rsqrt + Newton iteration for CPUs with better sqrtps throughput. (Or clang supports gcc's [`-mrecip=none`](https://stackoverflow.com/questions/32002277/why-does-gcc-or-clang-not-optimise-reciprocal-to-1-instruction-when-using-fast-m) to disable that optimization.) – Peter Cordes Mar 04 '18 at 02:06
  • @PeterCordes Thanks for the pointers. Right now I am fed up with auto-vectorizers, for the n-th time. Every few years I try a couple of "simple" cases with the then latest technology, only to be disappointed again. In the time it takes me to fiddle with all the potentially relevant compiler options, I am able to write the code myself using SIMD intrinsics (albeit one platform at a time, thus the desire for portable code that also vectorizes on demand). – njuffa Mar 04 '18 at 03:05
  • Auto-vectorization is very often disappointing :/ For my answer (of a branchless version of the algo in the question), I'd expect compilers to do a poor job of auto-vectorizing the blends, and definitely not spot the shuffle-LUT instead of blend optimization. – Peter Cordes Mar 04 '18 at 03:08
2

A branchless version of the code in the question is possible (with hardly any redundant work, just some compare/blends to create constants for the FMAs), but IDK if compilers will auto-vectorize it.

The main extra work is a useless sqrt / fma if all the elements had -|a| > -0.5625f, unfortunately on the critical path.


The arg to asinf_core is (r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f)).

In parallel with that, you (or the compiler) can be blending the coefficients for the FMA on the output.

If you sacrifice the accuracy of the pi/2 constant by putting it into one float instead of creating it with 2 constant multiplicands to fmaf, you can

fmaf( condition?-1:2,  asinf_core_result,  condition ? pi/2 : 0)

So you select between two constants, or andps a constant with a SIMD compare result to conditionally zero it (x86 SSE for example).


The final fixup is based on a range-check of the original input, so again there's instruction-level parallelism between FP blends and the FMA work of asinf_core.

In fact, we can optimize this into the previous FMA on the output of asinf_core, by blending the constant inputs to that on a second condition. We want asinf_core as one of the multiplicands for it, so we can negate or not by negating the constant. (A SIMD implementation might do a_cmp = andnot( a>0.0f, a>=-1.0f), and then multiplier ^ (-0.0f & a_cmp), where multiplier was conditionally done before.

The additive constant for that FMA on the output is 0, pi/2, pi, or pi + pi/2. Given the two compare results (on a and on r=-|a| for the non-NaN case), we could maybe combine that into a 2-bit integer and use it as a shuffle control to select an FP constant from a vector of all 4 constants, e.g. using AVX vpermilps (fast in-lane shuffle with a variable control). i.e. instead of blending 4 different ways, use a shuffle as a 2-bit LUT!

If we're doing that, we should also do it for the multiplicative constant, because creating the constant is the main cost. Variable blends are more expensive than shuffles on x86 (usually 2 uops vs. 1). On Skylake, variable blends (like vblendvps) can use any port (while shuffles only run on port 5). There's enough ILP that this probably bottlenecks on overall uop throughput, or overall ALU ports, not port 5. (Variable blend on Haswell is 2 uops for port 5, so it's strictly worse than vpermilps ymm,ymm,ymm).

We'd be selecting from -1, 1, -2, and 2.


Scalar with ternary operators, auto-vectorizes (with 8 vblendvps) with gcc7.3 -O3 -march=skylake -ffast-math. fast-math required for autovectorization :/ And unfortunately, gcc still uses rsqrtps + a Newton iteration (without FMA?!?), even with -mrecip=none, which I thought was supposed to disable this.

Autovectorizes with only 5 vblendvps with clang5.0 (with the same options). See both on the Godbolt compiler explorer. This compiles, and looks like probably the right amount of instructions, but otherwise untested.

/* Branchless arccos built on asinf_core(), maximum error = 1.456672 ulp
   (exhaustively verified in the discussion below).
   The original sketch selected the final scale/offset with two chained
   conditional blends whose sign logic was wrong for positive inputs
   (reported failure: a = -0x1.1ffffep-1 gave 4.1150 instead of 2.1682).
   Fixed version: fold the two conditions (a > 0, |a| < 0.5625) into a
   2-bit index into small constant tables, covering the four identities
     acos(a) = pi/2 - asin(a)                       (apos, rsmall)
     acos(a) = 2 * asin(sqrt((1-a)/2))              (apos, !rsmall)
     acos(a) = pi/2 - asin(a), a <= 0               (!apos, rsmall)
     acos(a) = pi - 2 * asin(sqrt((1+a)/2))         (!apos, !rsmall)
   A SIMD version can implement the table lookup as an in-lane shuffle
   (e.g. vpermilps) instead of two variable blends.
   NOTE(review): unlike the branchy original, this does not preserve the
   "sign" of NaN inputs. */
float my_acosf_branchless (float a)
{
    const float pi = 3.14159265f;
    const float mtab[4] = { -2.0f, -1.0f, 2.0f, 1.0f };   /* multiplier LUT */
    const float atab[4] = { pi, pi / 2, 0.0f, pi / 2 };   /* offset LUT */
    int apos = a > 0.0f;
    float r = apos ? (-a) : a;                /* r = -|a| */
    int rsmall = r > -0.5625f;                /* small |a|: asin directly */
    /* sqrt/fma computed unconditionally to keep the code straight-line */
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));
    float asinf_res = asinf_core (asinf_arg);
    int idx = 2 * apos + rsmall;              /* 2-bit LUT index */
    return fmaf (asinf_res, mtab[idx], atab[idx]);
}

Auto-vectorization tested with a loop that runs it over an array of 1024 aligned float elements; see the Godbolt link.

TODO: intrinsics version.

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
  • Any chance I could convince you to show actual code that I can feed to compilers at Compiler Explorer? :-) That would be easier to read than a textual description, and probably shorter as well. As I recall, I *cannot* reduce the accuracy of the PI-related constants without bumping the error bound in `acosf()` above 1.5 ulp, which is undesirable. – njuffa Mar 04 '18 at 03:13
  • @njuffa: sure, I'll take a shot at it. It might be useful for people that don't need 1.5 ulp, but it's too bad we'd probably lose that nice property. – Peter Cordes Mar 04 '18 at 03:15
  • @njuffa: ok done, scalar version which compiles. Beware of bugs, especially for the 4 combinations of conditions on `r` and `a` if I got anything mixed up there. – Peter Cordes Mar 04 '18 at 04:38
  • I have started looking at this and am currently trying to track down the source of the following error (first reported): `@ a = bf0fffff -5.62499940e-001 res=4.11498260e+000 ref=2.16820267e+000` – njuffa Mar 05 '18 at 17:26
  • Fully functional and accurate implementation of this idea: `float my_acosf_branchless (float a) { int apos, rsmall, idx; float r, asinf_arg, asinf_res; const float pi = 3.14159265f; const float mtab[4] = {-2.f, -1.f, 2.f, 1.f}; const float atab[4] = {pi, pi/2, 0, pi/2}; apos = a > 0.f; r = apos ? (-a) : a; rsmall = r > -.5625f; asinf_arg = rsmall ? r : sqrtf (fmaf (.5f, r, .5f)); asinf_res = asinf_core (asinf_arg); idx = 2*apos+rsmall; r = fmaf (asinf_res, mtab[idx], atab[idx]); return r; }` – njuffa Mar 05 '18 at 18:42
  • 1
    Maximum error of `my_acosf_branchless()` in my preceding comment is 1.456672 ulp. No luck yet in my attempts to massage the code into something that can be automatically vectorized by a compiler. – njuffa Mar 05 '18 at 18:55
  • Clang 5.0 [vectorizes](https://godbolt.org/g/9p51Qu) w/o `-ffast-math`: `float my_acosf_branchless(float a) { bool apos, rsmall; float r, asinf_arg, asinf_res, scale, ofs; const float pi = 3.14159265f; apos = a > 0.0f; r = apos ? (-a) : a; rsmall = r > -0.5625f; asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f)); asinf_res = asinf_core (asinf_arg); scale = rsmall ? 1.0f : 2.0f; scale = apos ? scale : (-scale); ofs = rsmall ? (pi/2) : 0.0f; ofs= apos ? ofs : (pi - ofs); r = fmaf (asinf_res, scale, ofs); return r; }` – njuffa Mar 05 '18 at 20:03
  • I am going to accept this as the answer since this approach seems to be slightly more palatable to automatic vectorizers in current compilers (gcc with `-ffast-math` and Clang with `-fno-honor-nans -fno-math-errno -fno-trapping-math`; no luck with icc). – njuffa Mar 06 '18 at 05:25
  • @njuffa: I'll hopefully get around to playing with this some more soon, and update the answer with your versions and / or and intrinsic version. I went low effort the first time because there was doubt about it being accurate enough. – Peter Cordes Mar 06 '18 at 05:34
2

This is not quite an alternative algorithmic approach, but nevertheless this extended remark might be of interest to you.

It seems that, with gcc, the function copysignf() is easier to vectorize than the ternary operator. In the following code I have rewritten your scalar solution with copysignf() instead of ternary operators.

The code vectorizes even with the fairly old gcc 4.9 compiler, with the options gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno. The sqrtf() function is vectorized to a vsqrtps instruction. The Godbolt link is here.

#include <stdio.h>
#include <immintrin.h>
#include <math.h>

float acosf_cpsgn (float a)
{
    float r, s, t, pi2;
/*    s = (a < 0.0f) ? 2.0f : (-2.0f); */
    s = copysignf(2.0f, -a);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
/*    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r      */
/*    r = (a < 0.0f) ? t : r;                                           */
    r = copysignf(r, a);
    pi2 = 0x1.ddcb02p+0f * 0.5f;                   /* no rounding here  */
    pi2 = pi2 - copysignf(pi2, a);                 /* no rounding here  */
    t = fmaf (pi2, 0x1.aee9d6p+0f, r);   // PI-r
    return t;
}



/* Scalar reference (verbatim copy of the Harley-based my_acosf() from the
   accepted answer, maximum error = 1.496766 ulp): t = 2*(1-|a|),
   s = sqrt(t), acos(|a|) = s + s*t*poly(t), then pi - r for a < 0. */
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);             /* t = 2 * (1 - |a|) */
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);                /* acos(|a|) */
    /* pi as product of two floats, fused for extra effective precision */
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}


/* The code from the next 2 functions is copied from the godbolt link in Peter Cordes' */
/* answer https://stackoverflow.com/a/49091530/2439725  and modified                   */
/* Auto-vectorization driver: applies my_acosf() elementwise over a
   1024-element batch; restrict + alignment hints enable clean SIMD loads. */
int autovec_test_a (float *__restrict dst, float *__restrict src) {
    dst = __builtin_assume_aligned (dst, 32);
    src = __builtin_assume_aligned (src, 32);
    for (int k = 0; k != 1024; ++k)
        dst[k] = my_acosf (src[k]);
    return 0;
}

/* Auto-vectorization driver for the copysignf() variant; identical loop
   shape to autovec_test_a() so the generated code can be compared. */
int autovec_test_b (float *__restrict dst, float *__restrict src) {
    dst = __builtin_assume_aligned (dst, 32);
    src = __builtin_assume_aligned (src, 32);
    for (int k = 0; k != 1024; ++k)
        dst[k] = acosf_cpsgn (src[k]);
    return 0;
}
wim
  • 3,702
  • 19
  • 23
  • Thanks for raising this particular aspect of vectorizable idioms. Your variant seems to work fine, except that my test framework complains about the wrong "sign" for NaN results: `arg= 0x1.000002p+0 res= 0x1.#QNAN0p+0 (7fc00000) ref=-0x1.#IND00p+0 (ffc00000)`. I guess the NaN INDEFINITE produced by `sqrt()` gets mangled during final fixup. I will ponder whether there is an easy fix to that. – njuffa Mar 20 '18 at 21:39
  • 1
    I have experimented a bit with inserting `t = copysignf(t, 1.0f - fabsf(a));` at the end of the function, right before the `return t;`. I did not test it for all existing floats, but at least for the arguments 2.0f and -2.0f it works. With these inputs the result is `res = 0xFFC00000`, which is the same as for `my_acosf()`. – wim Mar 20 '18 at 22:35
  • @njuffa: Do you really need the right sign on NaN? If so, and NaN is rare, a manual-vectorized version could maybe put the a sign-copy inside an `if(movmskps(cmpunordps(input,input)) != 0)`. So branch on the input having any NaNs, and then copy the signs (or the NaNs themselves with a blendv) for the elements that had input NaN. This takes the sign-copying off the critical path in the normal case where there are no NaNs, and replaces it with 3 uops: `vcmpeqps` / `vmovmskps` / `test+jcc`. They're off the critical path like all branches, and branching on `input` can detect mispredicts early. – Peter Cordes Mar 20 '18 at 23:31
  • @PeterCordes My personal design philosophy: I always like to match the reference implementation exactly for high-performance implementations if this is possible without noticeable performance impact. This is often possible without creating additional code branches if only one thinks about it carefully. In this case my hand-vectorized AVX2 version above matches exactly. – njuffa Mar 20 '18 at 23:43