Convert extended precision float (80-bit) to double (64-bit) in MSVC

Question

What is the most portable and "right" way to do conversion from extended precision float (80-bit value, also known as long double in some compilers) to double (64-bit) in MSVC win32/win64?

MSVC currently (as of 2010) assumes that long double is double synonym.

I could probably write fld/fstp assembler pair in inline asm, but inline asm is not available for win64 code in MSVC. Do I need to move this assembler code to separate .asm file? Is that really so there are no good solution?

score 5 · Accepted Answer · answered Oct 18 '10 at 16:00

Just did this in x86 code...

    .686P
    .XMM

_TEXT   SEGMENT

EXTRN   __fltused:DWORD

PUBLIC  _cvt80to64
PUBLIC  _cvt64to80

_cvt80to64 PROC

    mov eax, dword ptr [esp+4]
    fld TBYTE PTR [eax]

    ret 0
_cvt80to64 ENDP


_cvt64to80 PROC
    mov eax, DWORD PTR [esp+12]
    fld QWORD PTR [esp+4]
    fstp    TBYTE PTR [eax]
    ret 0
_cvt64to80 ENDP

ENDIF

_TEXT   ENDS
    END

score 4 · Answer 2 · answered Oct 16 '10 at 14:49

If your compiler / platform doesn't have native support for 80 bit floating point values, you have to decode the value yourself.

Assuming that the 80 bit float is stored within a byte buffer, located at a specific offset, you can do it like this:

float64 C_IOHandler::readFloat80(IColl<uint8> buffer, uint32 *ref_offset)
{
    uint32 &offset = *ref_offset;

    //80 bit floating point value according to the IEEE-754 specification and the Standard Apple Numeric Environment specification:
    //1 bit sign, 15 bit exponent, 1 bit normalization indication, 63 bit mantissa

    float64 sign;
    if ((buffer[offset] & 0x80) == 0x00)
        sign = 1;
    else
        sign = -1;
    uint32 exponent = (((uint32)buffer[offset] & 0x7F) << 8) | (uint32)buffer[offset + 1];
    uint64 mantissa = readUInt64BE(buffer, offset + 2);

    //If the highest bit of the mantissa is set, then this is a normalized number.
    float64 normalizeCorrection;
    if ((mantissa & 0x8000000000000000) != 0x00)
        normalizeCorrection = 1;
    else
        normalizeCorrection = 0;
    mantissa &= 0x7FFFFFFFFFFFFFFF;

    offset += 10;

    //value = (-1) ^ s * (normalizeCorrection + m / 2 ^ 63) * 2 ^ (e - 16383)
    return (sign * (normalizeCorrection + (float64)mantissa / ((uint64)1 << 63)) * g_Math->toPower(2, (int32)exponent - 16383));
}

This is how I did it, and it compiles fine with g++ 4.5.0. It of course isn't a very fast solution, but at least a functional one. This code should also be portable to different platforms, though I didn't try.

Calmarius · Answer 3 · 2016-06-21T05:37:04.923

I've just written this one. It constructs an IEEE double number from IEEE extended precision number using bit operations. It takes the 10 byte extended precision number in little endian format:

typedef unsigned long long uint64;

double makeDoubleFromExtended(const unsigned char x[10])
{
    int exponent = (((x[9] << 8) | x[8]) & 0x7FFF);
    uint64 mantissa =
        ((uint64)x[7] << 56) | ((uint64)x[6] << 48) | ((uint64)x[5] << 40) | ((uint64)x[4] << 32) | 
        ((uint64)x[3] << 24) | ((uint64)x[2] << 16) | ((uint64)x[1] << 8) | (uint64)x[0];
    unsigned char d[8] = {0};
    double result;

    d[7] = x[9] & 0x80; /* Set sign. */

    if ((exponent == 0x7FFF) || (exponent == 0))
    {
        /* Infinite, NaN or denormal */
        if (exponent == 0x7FFF)
        {
            /* Infinite or NaN */
            d[7] |= 0x7F;
            d[6] = 0xF0;
        }
        else
        {
            /* Otherwise it's denormal. It cannot be represented as double. Translate as singed zero. */
            memcpy(&result, d, 8);
            return result;
        }
    }
    else
    {
        /* Normal number. */
        exponent = exponent - 0x3FFF + 0x03FF; /*< exponent for double precision. */

        if (exponent <= -52)  /*< Too small to represent. Translate as (signed) zero. */
        {
            memcpy(&result, d, 8);
            return result;
        }
        else if (exponent < 0)
        {
            /* Denormal, exponent bits are already zero here. */
        }
        else if (exponent >= 0x7FF) /*< Too large to represent. Translate as infinite. */
        {
            d[7] |= 0x7F;
            d[6] = 0xF0;
            memset(d, 0x00, 6);
            memcpy(&result, d, 8);
            return result;
        }
        else
        {
            /* Representable number */
            d[7] |= (exponent & 0x7F0) >> 4;
            d[6] |= (exponent & 0xF) << 4;
        }
    }
    /* Translate mantissa. */

    mantissa >>= 11;

    if (exponent < 0)
    {
        /* Denormal, further shifting is required here. */
        mantissa >>= (-exponent + 1);
    }

    d[0] = mantissa & 0xFF;
    d[1] = (mantissa >> 8) & 0xFF;
    d[2] = (mantissa >> 16) & 0xFF;
    d[3] = (mantissa >> 24) & 0xFF;
    d[4] = (mantissa >> 32) & 0xFF;
    d[5] = (mantissa >> 40) & 0xFF;
    d[6] |= (mantissa >> 48) & 0x0F;

    memcpy(&result, d, 8);

    printf("Result: 0x%016llx", *(uint64*)(&result) );

    return result;
}

I think that the treatment of the case `if (exponent <= 0)` means that numbers that could be represented as binary64 subnormals end up represented as `0.0`. — Pascal Cuoq, Sep 17 '13 at 16:57
Fixed it. The exact `exponent == 0` case can indeed yield a denormal double. — Calmarius, Sep 17 '13 at 20:08
It's still not quite right: the cases in which there is something to do are the cases when `exponent` is between -52 and 0 (at that point of the code), and what must be done is **roughly** to make the implicit bit explicit, shift the significand that was going to be used to the right by `-exponent`, and set `exponent` to zero. The OP ended up using `FSTP`, so it's not very important, but you'll need to do it for `FSTP` in your emulator :) — Pascal Cuoq, Sep 18 '13 at 13:33
Scratch the part about making an implicit bit explicit: the leading 1 is already explicit in `mantissa`. — Pascal Cuoq, Sep 18 '13 at 14:11
Just saw this while looking for something else, and... out of curiosity, why did you `typedef unsigned __int64 uint64;`, instead of using `uint64_t` from ``? Was it to remove a dependency that isn't really necessary because you're only using a single name, because your compiler didn't support it, or for some other reason? — Justin Time - Reinstate Monica, Jun 20 '16 at 21:53
@JustinTime don't know... VC9 don't have stdint. Replaced it with long long. — Calmarius, Jun 21 '16 at 05:39
@Calmarius I wasn't asking why you used `__int64` instead of `long long`, just why you made your own `uint64` instead of using `uint64_t`. According to the MSDN, `__int64` was added in VS 6.0 (so it was indeed present in VS9); apparently, they actually had `__int64` before they had `long long`. My question was just why you made your own typedef, instead of using the functionally identical one from the standard header; I wasn't aware that VS9 didn't have ``, and thought you did it because you didn't need any other types from the header. — Justin Time - Reinstate Monica, Jun 22 '16 at 17:11

score 0 · Answer 4 · answered Dec 08 '11 at 08:47

Played with the given answers and ended up with this.

#include <cmath>
#include <limits>
#include <cassert>

#ifndef _M_X64

__inline __declspec(naked) double _cvt80to64(void* ) {
  __asm {
  //  PUBLIC _cvt80to64 PROC

    mov eax, dword ptr [esp+4]
    fld TBYTE PTR [eax]

    ret 0
  //    _cvt80to64 ENDP
  }
}

#endif

#pragma pack(push)
#pragma pack(2)
typedef unsigned char tDouble80[10];
#pragma pack(pop)


typedef struct {
  unsigned __int64 mantissa:64;
  unsigned int exponent:15;
  unsigned int sign:1;
} tDouble80Struct;

inline double convertDouble80(const tDouble80& val)
{
  assert(10 == sizeof(tDouble80));

  const tDouble80Struct* valStruct = reinterpret_cast<const tDouble80Struct*>(&val);

  const unsigned int mask_exponent = (1 << 15) - 1;
  const unsigned __int64 mantissa_high_highestbit = unsigned __int64(1) << 63;
  const unsigned __int64 mask_mantissa = (unsigned __int64(1) << 63) - 1;

  if (mask_exponent == valStruct->exponent) {

    if(0 == valStruct->mantissa) {
      return (0 != valStruct->sign) ? -std::numeric_limits<double>::infinity() : std::numeric_limits<double>::infinity();
    }

    // highest mantissa bit set means quiet NaN
    return (0 != (mantissa_high_highestbit & valStruct->mantissa)) ? std::numeric_limits<double>::quiet_NaN() :  std::numeric_limits<double>::signaling_NaN();
  }   

  // 80 bit floating point value according to the IEEE-754 specification and 
  // the Standard Apple Numeric Environment specification:
  // 1 bit sign, 15 bit exponent, 1 bit normalization indication, 63 bit mantissa

  const double sign(valStruct->sign ? -1 : 1);


  //If the highest bit of the mantissa is set, then this is a normalized number.
  unsigned __int64 mantissa = valStruct->mantissa;
  double normalizeCorrection = (mantissa & mantissa_high_highestbit) != 0 ? 1 : 0;
  mantissa &= mask_mantissa;

  //value = (-1) ^ s * (normalizeCorrection + m / 2 ^ 63) * 2 ^ (e - 16383)
  return (sign * (normalizeCorrection + double(mantissa) / mantissa_high_highestbit) * pow(2.0, int(valStruct->exponent) - 16383));
}

Convert extended precision float (80-bit) to double (64-bit) in MSVC

4 Answers4

Linked