1

Follow-up question for IEEE 754 conformant sqrt() implementation for double type.

Context: Need to implement IEEE 754 conformant sqrtf() taking into account the following HW restrictions and usage limitations:

  1. Provides a special instruction qseed.f to get an approximation of the reciprocal of the square root (the accuracy of the result is no less than 6.75 bits, and therefore always within ±1% of the accurate result).

  2. Single precision FP:

    a. Support by HW (SP FPU): has support;

    b. Support by SW (library): has support;

    c. Support of subnormal numbers: no support (FLT_HAS_SUBNORM is 0).

  3. Double precision FP:

    a. Support by HW (DP FPU): no support;

    b. Support by SW (library): has support;

    c. Support of subnormal numbers: no support (DBL_HAS_SUBNORM is 0).

I've found one presentation by John Harrison and ended up with this implementation (note that here qseed.f is replaced by rsqrtf()):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

// https://github.com/nickzman/hyperspace/blob/master/frsqrt.hh
#if 1
/*
 * Fast approximation of 1/sqrt(x) ("Quake" magic-constant trick) refined
 * by three Newton-Raphson iterations.
 *
 * Fix: the original punned bits via pointer casts (*(int*)&x), which
 * violates the strict-aliasing rule (C11 6.5p7) and is undefined behavior.
 * memcpy-based type punning is well-defined and compiles to the same code
 * on modern compilers.
 */
float rsqrtf ( float x )
{
    const float xhalf = 0.5f * x;
    int32_t     i;

    memcpy( &i, &x, sizeof i );          /* reinterpret float bits as int  */
    i = 0x5f375a86 - ( i >> 1 );         /* magic initial guess            */
    memcpy( &x, &i, sizeof x );
    x = x * ( 1.5f - xhalf * x * x );    /* Newton-Raphson iteration 1     */
    x = x * ( 1.5f - xhalf * x * x );    /* Newton-Raphson iteration 2     */
    x = x * ( 1.5f - xhalf * x * x );    /* Newton-Raphson iteration 3     */

    return x;
}
#else
/* reference implementation used for comparison when the #if is flipped */
float rsqrtf ( float x )
{
    return 1.0f / sqrtf( x );
}
#endif

float   sqrtfr_jh( float x, float r )
{
    /*
     * John Harrison, Formal Verification Methods 5: Floating Point Verification,
     * Intel Corporation, 12 December 2002, document name: slides5.pdf, page 14,
     * slide "The square root algorithm".
     * URL: https://www.cl.cam.ac.uk/~jrh13/slides/anu-09_12dec02/slides5.pdf
     */
    /*
     * Refines the reciprocal-square-root seed r into sqrt(x), carrying all
     * intermediates in double and rounding to float only once at the end.
     * NOTE(review): this fma sequence follows a formally verified algorithm;
     * do not reorder or re-associate any of the operations below.
     */
    double rd, b, z0, s0, d, k, h0, e, t0, s1, c, d1, h1, s;
    static const double half = 0.5;
    static const double one = 1.0;
    static const double three = 3.0;
    static const double two = 2.0;
    rd = (double)r;                  /* widen the seed approximation of 1/sqrt(x) */
    b = half * x;                    /* b = x/2                                   */
    z0 = rd * rd;                    /* z0 ~= 1/x                                 */
    s0 = x * rd;                     /* s0 ~= sqrt(x), initial estimate           */
    d = fma( -b, z0, half );         /* d = (1 - x*rd^2)/2: half the rel. error   */
    k = fma( x, rd, -s0 );           /* k = x*rd - s0: rounding error of s0       */
    h0 = half * rd;                  /* h0 ~= 1/(2*sqrt(x))                       */
    e = fma( three / two, d, one );  /* e = 1 + 1.5*d: correction factor          */
    t0 = fma( d, s0, k );            /* t0 = d*s0 + k: scaled error term          */
    s1 = fma( e, t0, s0 );           /* s1: improved sqrt estimate                */
    c = fma( d, e, one );            /* c = 1 + d*e: correction for h0            */
    d1 = fma( -s1, s1, x );          /* d1 = x - s1^2: residual of s1             */
    h1 = c * h0;                     /* h1: refined half-reciprocal-sqrt          */
    s = fma( d1, h1, s1 );           /* final correction step                     */
    return (float)s;                 /* single rounding to float                  */
}

/* Square root with IEEE-754 special-case handling; normal positive inputs
   are delegated to the Harrison refinement seeded by rsqrtf(). */
float   my_sqrtf( float x )
{
    /* sqrt(+0) = +0 and sqrt(-0) = -0: doubling preserves the zero's sign */
    if (x == 0) {
        return x + x;
    }
    /* negative non-zero arguments are invalid operations: quiet NaN */
    if (x < 0) {
        return NAN;
    }
    /* positive finite arguments take the computational path */
    if (x < INFINITY) {
        return sqrtfr_jh( x, rsqrtf( x ) );
    }
    /* +Inf stays +Inf, NaN propagates through the addition */
    return x + x;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
/* State of Marsaglia's 64-bit KISS PRNG; seed values from the original post */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t; /* scratch used by the MWC step */
/* Multiply-With-Carry component, period 2^121+2^63-1 (see header comment) */
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
/* Xorshift component, period 2^64-1 */
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
/* Congruential component, period 2^64 */
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* Combined generator: sum of the three components */
#define KISS64 (MWC64 + XSH64 + CNG64)

/*
 * Test driver: checks a few special cases, then compares my_sqrtf() against
 * the library sqrtf() on random bit patterns (skipping subnormal inputs,
 * which are out of scope since FLT_HAS_SUBNORM is 0 on the target).
 *
 * Fixes: "%lu" is the wrong conversion specifier for uint64_t on LP32/LLP64
 * platforms (undefined behavior per C11 7.21.6.1p9) -- values are now cast
 * to unsigned long long and printed with "%llu".  The loop index is size_t
 * to match the unsigned result of the sizeof expression.
 */
int main (void)
{
    const uint64_t N = 10000000000ULL; /* desired number of test cases */
    float arg, ref, res;
    uint64_t argi64;
    uint32_t refi, resi;
    uint64_t count = 0;
    float spec[] = {0.0f, 1.0f, INFINITY, NAN};

    printf ("test a few special cases:\n");
    for (size_t i = 0; i < sizeof (spec) / sizeof (spec[0]); i++) {
        printf ("my_sqrt(%a) = %a\n", spec[i], my_sqrtf(spec[i]));
        printf ("my_sqrt(%a) = %a\n", -spec[i], my_sqrtf(-spec[i]));
    }

    printf ("test %llu random cases:\n", (unsigned long long)N);
    do {
        argi64 = KISS64;
        memcpy (&arg, &argi64, sizeof arg);
        /* subnormal inputs are out of scope (no hardware support) */
        if (fpclassify (arg) == FP_SUBNORMAL) {
            continue;
        }
        ++count;
        res = my_sqrtf (arg);
        ref = sqrtf (arg);
        memcpy (&resi, &res, sizeof resi);
        memcpy (&refi, &ref, sizeof refi);
        /* any NaN result is acceptable as long as the reference is NaN too;
           otherwise require bit-identical results */
        if (!(isnan (res) && isnan (ref)) && (resi != refi)) {
            printf ("\rerror @ arg=%a (%e)\n", arg, arg);
            printf ("\rerror @ res=%a (%e)\n", res, res);
            printf ("\rerror @ ref=%a (%e)\n", ref, ref);
            return EXIT_FAILURE;
        }
        if ((count & 0xfffff) == 0) printf ("\r[%llu]", (unsigned long long)count);
    } while (count < N);
    printf ("\r[%llu]", (unsigned long long)count);
    printf ("\ntests PASSED\n");
    return EXIT_SUCCESS;
}

And it seems to work correctly (at least for some random cases): it reports:

[10000000000]
tests PASSED

Now the question: since this adaptation of John Harrison's sqrtf() algorithm performs its computations in double precision (type double) rather than the original's single precision (type float), is it possible to reduce the number of operations while using only double precision computations (apart from the conversions) and still be IEEE 754 conformant?

P.S. Since users @njuffa and @chux - Reinstate Monica are strong in FP, I invite them to participate. However, all the competent in FP users are welcome.

pmor
  • 5,392
  • 4
  • 17
  • 36
  • What is the role of `float rsqrtf ( float x ) { return 1.0f / sqrtf( x );`? A hobbled reference function? – chux - Reinstate Monica Mar 22 '21 at 21:51
  • 1
    @pmor Computing a single-precision `sqrtf` via double precision when there is no double-precision support in hardware seems extremely inefficient. What is the objective here? Is the question actually: What is an efficient way of computing `sqrtf` using only single-precision operations? FWIW, single-argument single-precision functions should be tested exhaustively. It takes only minutes to do so. – njuffa Mar 22 '21 at 22:13
  • @pmor Does this hardware platform actually support fused-multiply add operations, or is `fmaf()` implemented via software emulation? If the latter, you will probably need to use integer operations to achieve correctly rounded `sqrtf`. – njuffa Mar 22 '21 at 22:47
  • @njuffa (1) The issue is that single precision FPU (SP FPU) has no support of subnormal numbers, and I thought that the support of subnormal numbers is required to implement IEEE 754 conformant `sqrtf()`. I also thought that this "subnormal numbers issue" can be solved if using doubles (even if they don't support subnormal numbers). – pmor Mar 23 '21 at 19:09
  • @njuffa (2) Yes, the HW supports `fmaf()`, however, w/o support of subnormal numbers: _The FPU has the following restrictions and usage limitations: a. Only IEEE-754 single precision format is supported, b. IEEE-754 denormalized numbers are not supported for arithmetic operations_. – pmor Mar 23 '21 at 19:09
  • @njuffa (3) The objective here is to implement IEEE 754 conformant `sqrtf()` using single precision FPU (SP FPU) which has no support of subnormal numbers. – pmor Mar 23 '21 at 19:12
  • @njuffa _FWIW, single-argument single-precision functions should be tested exhaustively_: interesting! Thanks for the confirmation! – pmor Mar 23 '21 at 19:13
  • @pmor It makes *no* sense to implement `fsqrtf()` with support for subnormals in a floating-point environment that does not support subnormals. Since your platform does not support subnormals, it is (presumably) *compatible* with IEEE-754, but *not compliant*. This is a fairly common scenario for low-end processors or processor focused on signal processing or 3D graphics. – njuffa Mar 23 '21 at 19:19
  • @njuffa _It makes no sense to implement fsqrtf() with support for subnormals in a floating-point environment that does not support subnormals_: I meant different thing. I meant that support of subnormals may be required to implement `fsqrtf()` w/o support for subnormals as input. Ex.: `sqrtff(3.402823e+38f)` using John Harrison's algorithm. `rsqrtf( x )` gives `5.42101086e-20` (normal). According to the algorithm `z0 = y0^2;` (see slides5.pdf, page 14), which is `5.42101086e-20 * 5.42101086e-20 = 2.938736E-39 (subnormal)`. Hence, in case of `FLT_HAS_SUBNORM is 0`, `z0` will be 0, which ... – pmor Mar 23 '21 at 20:08
  • ... invalidates the rest of the computations. That is why I've thought that moving to doubles can solve this "subnormal numbers issue". – pmor Mar 23 '21 at 20:08
  • @pmor As I point out in my answer, one has to guard against overflow and underflow in intermediate computation. This often becomes a tad harder when there is no support for subnormals in basic arithmetic operations provided by the hardware and things "fall over a cliff". Most papers and presentations by researchers do not bother with such issues, they focus on fastpath implementation only. I tested the code I posted in an environment with subnormal support, and another one without subnormal support, and in both cases it delivers correct results consistent with the properties of the environment – njuffa Mar 23 '21 at 20:24
  • @njuffa Thanks. Out of curiosity: which environment without subnormal support do you use? Is it PC based? (I mean can I use regular PC?) QEMU? P.S. I find strange that HW which I work with does not have an FPU option to _disable_ support of subnormals (either for SP FPU, or for DP FPU, or for both FPUs). Any ideas why? A defect in the FPU design? – pmor Mar 23 '21 at 20:38
  • @pmor I used CUDA and compiled with `ftz=true`, where "ftz" stands for flush to zero. On PCs, SSE has a control word that lets programmers specify DAZ and FTZ modes. See `_MM_SET_DENORMALS_ZERO_MODE()`, `_MM_SET_FLUSH_ZERO_MODE()`. IMHO, flush-to-zero mode is a kludge to (1) simplify hardware (2) get around performance issues arising from the subnormal support implemented in many processors (which often takes the form of internal exception mechanism). – njuffa Mar 23 '21 at 20:45
  • @njuffa Thanks, interesting. Btw, our code base already has one reference to you: `$ grep -r njuffa src -A 1` gives `roundg.c:// the idea to add and subtract 2^52 is from njuffa \n roundg.c-// http://stackoverflow.com/questions/14919512/`. – pmor Mar 23 '21 at 21:00
  • @pmor The idea of rounding to integer using a "magic addition" is not mine. I *think* I learned that from the PowerPC Compiler Writer's Guide some 25 years ago. But my memory is hazy, it may have been from an internet newsgroup instead. My extension is to roll that magic addition into an FMA, where applicable. So who is "us", if I may ask? – njuffa Mar 23 '21 at 22:07
  • @njuffa Us is [tasking.com](https://www.tasking.com/) (I am junior there, so, it is more likely "me dealing with their source code"). – pmor Mar 24 '21 at 14:09
  • @pmor That would explain the Tricore-like platform you seem to be using :-). Probably best to delete your comment, now that I have satisfied my curiosity. – njuffa Mar 24 '21 at 17:06
  • @njuffa Legal issue. The _content contributed on or after 2018-05-02 (UTC) is distributed under the terms of CC BY-SA 4.0_ ([link](https://stackoverflow.com/help/licensing)). And `CC BY-SA 4.0` _may have_ (compatibility) issues with licenses for commercial / proprietary software. Hence, I have to ask for a permission of using the source code of `my_sqrtf()` in the product. Do you give such permission? Under what conditions? Thanks for understanding. – pmor Mar 24 '21 at 19:59
  • @pmor AFAIK, CC anything is generally not a suitable license for software. This is a long-standing deficiency with Stackoverflow that they have failed to resolve. In other venues I post code under an OSI-compliant 2-clause BSD license which makes it suitable for inclusion in any kind of software. As I understand Stackoverflow usage terms, I cannot use a comment block with the license for code posted here. – njuffa Mar 24 '21 at 20:24
  • @njuffa Thanks! Yes, I see that [CC BY-SA is discouraged for code](https://opensource.stackexchange.com/questions/1717/why-is-cc-by-sa-discouraged-for-code). – pmor Mar 24 '21 at 22:48
  • @pmor Have a look at the NVIDIA forums: https://forums.developer.nvidia.com/t/performance-tweak-for-single-precision-square-root/173274 – njuffa Mar 25 '21 at 01:17
  • @njuffa Thanks. Impressive. About "modest, only a few percent": I've learned that even tenths of a percent count. For the `memcpy` (`float_as_uint32` / `uint32_as_float`): why not using type punning via `union` (`typedef union { uint32_t u; float f; } u_t;`)? – pmor Mar 25 '21 at 17:49
  • @pmor Using a `union` for type punning has well-defined behavior only in C. `memcpy` for type punning has well-defined behavior in C and C++. Modern compilers know how to optimize out the `memcpy`, so I switched from the `union` method to the `memcpy` method about five years ago. – njuffa Mar 25 '21 at 17:55
  • @njuffa Thanks, understand. [This](https://stackoverflow.com/a/31080901/1778275) answer tells the same: _using memcpy should generate identical code and is does not break optimizations and well defined behavior_ in both C and C++. – pmor Mar 25 '21 at 18:05
  • @njuffa Out of curiosity: is it possible to mathematically prove that the minimum number of FPU instructions, which are needed in addition to `MUFU.RSQ`, is 4? And why exactly 4, if there are 4 `fmaf` calls, 6 multiplications (`slowpath` version, which is worst case), and 1 negation (not counting `-0.5f`)? – pmor Mar 25 '21 at 18:18
  • @pmor Sorry, this is getting out of hand, and I will stop responding here. This is a Q&A site, not a discussion forum or chat room. The work of Peter Markstein showed that if the reciprocal square root is accurate to 1 ulp, a four instruction FMA-based sequence is sufficient to get a correctly rounded square root. In the fastpath. As I said, researchers focus on the fastpath. Negations of source operands are typically rolled into various FMA variants. – njuffa Mar 25 '21 at 18:56
  • @njuffa Cc'd your nvidia email. Got `550 5.4.1 Recipient address rejected: Access denied. AS(201806281)`. Is it normal? – pmor Jun 12 '21 at 21:59
  • @njuffa Then which email can be used? Guess that iit, nexgen, earthlink are all outdated now. – pmor Jun 14 '21 at 08:02

1 Answer

1

Computing a single-precision square root via double-precision code is going to be inefficient, especially if the hardware provides no native double-precision operations.

The following assumes hardware that conforms to IEEE-754 (2008), except that subnormals are not supported and flushed to zero. Fused-multiply add (FMA) is supported. It further assumes an ISO-C99 compiler that maps float to IEEE-754 binary32, and that maps the hardware's single-precision FMA instruction to the standard math function fmaf().

From a hardware starting approximation for the reciprocal square root with a maximum relative error of 2^(-6.75) one can get to a reciprocal square root accurate to 1 single-precision ulp with two Newton-Raphson iterations. Multiplying this with the original argument provides an accurate estimate of the square root. The square of this approximation is subtracted from the original argument to compute the approximation error for the square root. This error is then used to apply a correction to the square root approximation, resulting in a correctly-rounded square root.

However, this straightforward algorithm breaks down for arguments that are very small due to underflow or overflow in intermediate computation, in particular when the underlying arithmetic operates in flush-to-zero mode that flushes subnormals to zero. For such arguments we can construct a slowpath code that scales the input towards unity, and scales back the result accordingly once the square root has been computed. Code for handling special operands such as zeros, infinities, NaNs, and negative arguments other than zero is also added to this slowpath code.

The NaN generated by the slowpath code for invalid operations should be adjusted to match the system's existing operations. For example, for x86-based systems this would be a special QNaN called INDEFINITE, with a bit pattern of 0xffc00000, while for a GPU running CUDA it would be the canonical single-precision NaN with a bit pattern of 0x7fffffff.

For performance reasons it may be useful to inline the fastpath code while making the slowpath code a called outlined subroutine. Single-precision math functions with a single argument should always be tested exhaustively against a "golden" reference implementation, which takes just minutes on modern hardware.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

float uint32_as_float (uint32_t);
uint32_t float_as_uint32 (float);
float qseedf (float);
float sqrtf_slowpath (float);

/* Square root computation for IEEE-754 binary32 mapped to 'float' */
float my_sqrtf (float arg)
{
    const uint32_t upper = float_as_uint32 (0x1.fffffep+127f);
    const uint32_t lower = float_as_uint32 (0x1.000000p-102f);
    float rsq, sqt, err;

    /* use fastpath computation if argument in [0x1.0p-102, 0x1.0p+128) */
    /* The unsigned subtraction wraps every bit pattern outside [lower, upper]
       (zeros, tiny positives, Inf, NaN, all negatives) to a value greater
       than (upper - lower), so one compare selects all fastpath arguments. */
    if ((float_as_uint32 (arg) - lower) <= (upper - lower)) {
        /* generate low-accuracy approximation to rsqrt(arg) */
        rsq = qseedf (arg);
        
        /* apply two Newton-Raphson iterations with quadratic convergence */
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        
        /* compute sqrt from rsqrt, round result to nearest or even */
        sqt = rsq * arg;
        /* err = arg - sqt*sqt, computed exactly in one fma: the residual of
           the square root estimate, used for the final correction step */
        err = fmaf (sqt, -sqt, arg);
        sqt = fmaf (0.5f * rsq, err, sqt);
    } else {
        /* zeros, arguments below 2^-102, Inf, NaN, and negatives */
        sqt = sqrtf_slowpath (arg);
    }
    return sqt;
}

/* interpret bit pattern of 32-bit unsigned integer as IEEE-754 binary32 */
float uint32_as_float (uint32_t a)
{
    /* type punning through a union is well-defined in C (C11 6.5.2.3) */
    union { uint32_t u; float f; } cvt;
    cvt.u = a;
    return cvt.f;
}

/* interpret bit pattern of IEEE-754 binary32 as a 32-bit unsigned integer */
uint32_t float_as_uint32 (float a)
{
    /* type punning through a union is well-defined in C (C11 6.5.2.3) */
    union { float f; uint32_t u; } cvt;
    cvt.f = a;
    return cvt.u;
}

/* simulate low-accuracy hardware approximation to 1/sqrt(a) */
float qseedf (float a)
{
    float r = 1.0f / sqrtf (a);
    r = uint32_as_float (float_as_uint32 (r) & ~0x1ffff);
    return r;
}

/* square root computation suitable for all IEEE-754 binary32 arguments */
/* Handles the cases rejected by the fastpath: zeros, arguments below
   2^-102, infinities, NaNs, and negative arguments. */
float sqrtf_slowpath (float arg)
{
    const float FP32_INFINITY = uint32_as_float (0x7f800000);
    const float FP32_QNAN = uint32_as_float (0xffc00000); /* system specific */
    /* 2^26 is an even power of two, so sqrt(arg * 2^26) = sqrt(arg) * 2^13
       exactly; both scalings are exact (power-of-two multiplies) */
    const float scale_in  = 0x1.0p+26f;
    const float scale_out = 0x1.0p-13f;
    float rsq, err, sqt;

    if (arg < 0.0f) {
        /* sqrt of a negative non-zero value: invalid operation */
        return FP32_QNAN;
    } else if ((arg == 0.0f) || !(fabsf (arg) < FP32_INFINITY)) { /* Inf, NaN */
        /* +-0 keeps its sign, +Inf stays +Inf, NaN propagates */
        return arg + arg;
    } else {
        /* scale subnormal arguments towards unity */
        arg = arg * scale_in;
        
        /* generate low-accuracy approximation to rsqrt(arg) */
        rsq = qseedf (arg);
        
        /* apply two Newton-Raphson iterations with quadratic convergence */
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        
        /* compute sqrt from rsqrt, round to nearest or even */
        sqt = rsq * arg;
        /* err = arg - sqt*sqt: residual used for the final correction */
        err = fmaf (sqt, -sqt, arg);
        sqt = fmaf (0.5f * rsq, err, sqt);

        /* compensate scaling of argument by counter-scaling the result */
        sqt = sqt * scale_out;
        
        return sqt;
    }
}

/* Exhaustive test: iterate over all 2^32 bit patterns, interpret each as a
   float, and require my_sqrtf() to match sqrt() computed in double and
   rounded to float, bit for bit. */
int main (void)
{
    uint32_t ai = 0x00000000;

    do {
        float a = uint32_as_float (ai);
        float res = my_sqrtf (a);
        double ref = sqrt ((double)a);
        float reff = (float)ref;
        uint32_t resi = float_as_uint32 (res);
        uint32_t refi = float_as_uint32 (reff);

        if (resi != refi) {
            printf ("error @ %08x %15.8e   res=%08x %15.8e  ref=%08x %15.8e\n",
                    ai, a, resi, res, refi, reff);
            return EXIT_FAILURE;
        }
        ai++;
    } while (ai);   /* terminates when the counter wraps back to zero */

    return EXIT_SUCCESS;
}
njuffa
  • 23,970
  • 4
  • 78
  • 130