Below I am showing an ISO-C99 implementation of `float` to `half` conversion that has been tested exhaustively. The following assumptions apply: `float` maps to the IEEE-754 `binary32` format while `half` maps to the IEEE-754 `binary16` format; both floating-point and integer data types use the same endianness when stored; conversion to a narrower floating-point type should use the rounding mode to-nearest-or-even.
As a golden reference the test framework uses the x86-64 instruction set extension F16C, introduced in 2011 to support half precision (FP16) as a storage type. IEEE-754 NaN handling contains some architecture-specific elements, and the `float2half_rn()` function below was designed to mimic the x86-64 behavior. Adjustments, for example switching to the use of a single canonical NaN encoding, are trivial.
The code below is derived from code that I previously published under a BSD license here. I used the Intel Compiler Version 13.1.3.198 Build 20130607 to build this code and ran the exhaustive test on an IvyBridge CPU.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "immintrin.h"
/*
 * Reinterpret the object representation of a float as a 32-bit unsigned
 * integer. The bytes are copied with memcpy, so no pointer of an
 * incompatible type is ever dereferenced.
 *
 * a: value whose bit pattern is wanted.
 * Returns the first sizeof(uint32_t) bytes of `a`, viewed as a uint32_t.
 */
uint32_t float_as_uint32 (float a)
{
    uint32_t bits;

    memcpy (&bits, &a, sizeof bits);
    return bits;
}
/*
 * Convert a binary32 value to a binary16 bit pattern, rounding to nearest
 * with ties going to the even result.
 *
 * a: value to convert; its bit pattern is obtained via float_as_uint32().
 * Returns the 16-bit encoding: sign in bit 15, exponent in bits 14..10,
 * fraction in bits 9..0.
 *
 * Case summary:
 *   - infinity            -> signed infinity (0x7c00)
 *   - NaN                 -> quiet bit forced on, upper payload bits kept
 *   - exponent field below 0x33000000 (magnitude under 2^-25) -> signed zero
 *   - unbiased exponent above 15 -> signed infinity
 *   - everything else     -> normal or subnormal half, rounded
 */
uint16_t float2half_rn (float a)
{
    const uint32_t bits = float_as_uint32 (a);
    const uint32_t exp_field = bits & 0x7f800000;
    const uint16_t sign = (uint16_t)((bits >> 16) & 0x8000);
    uint32_t significand;
    uint32_t discarded;     /* bits shifted out, left-aligned in 32 bits */
    uint16_t half;
    int unbiased;

    /* Exponent field all ones: infinity or NaN. */
    if (exp_field == 0x7f800000) {
        if ((bits & 0x007fffff) == 0) {
            return (uint16_t)(sign | 0x7c00);
        }
        /* 0x7e00 sets the exponent and the quiet bit; the mask keeps the
           nine payload bits that sit just below the quiet bit. */
        return (uint16_t)(sign | 0x7e00 | ((bits >> (24 - 11)) & 0x1ff));
    }

    /* Anything with an exponent below that of 2^-25 becomes signed zero. */
    if (exp_field < 0x33000000) {
        return sign;
    }

    unbiased = (int)(exp_field >> 23) - 127;
    if (unbiased > 15) {
        return (uint16_t)(sign | 0x7c00);   /* too large: infinity */
    }

    /* 24-bit significand with the implicit leading one made explicit. */
    significand = (bits & 0x007fffff) | 0x00800000;

    if (unbiased >= -14) {
        /* Normal result. The leading one lands on bit 10, so adding
           (14 + unbiased) << 10 yields the biased exponent 15 + unbiased. */
        half = (uint16_t)(sign | (significand >> (24 - 11)));
        discarded = significand << (32 - (24 - 11));
        half = (uint16_t)(half + ((14 + unbiased) << 10));
    } else {
        /* Subnormal result: shift further right, exponent field stays 0. */
        const int drop = -1 - unbiased;

        half = (uint16_t)(sign | (significand >> drop));
        discarded = significand << (32 - drop);
    }

    /* Round to nearest, ties to even: round up when more than half an ulp
       was discarded, or exactly half and the kept result is odd. A carry
       out of the fraction correctly bumps the exponent. */
    if ((discarded > 0x80000000) ||
        ((discarded == 0x80000000) && (half & 1))) {
        half++;
    }
    return half;
}
uint16_t float2half_rn_ref (float a)
{
__m128 pa = _mm_set_ps1 (a);
__m128i r16 = _mm_cvtps_ph (pa, _MM_FROUND_TO_NEAREST_INT);
uint16_t res;
memcpy (&res, &r16, sizeof res);
return res;
}
/*
 * Reinterpret a 32-bit unsigned integer as a float with the same object
 * representation. The bytes are copied with memcpy rather than read
 * through a cast pointer.
 *
 * a: bit pattern to reinterpret.
 * Returns the float whose bytes equal those of `a`.
 */
float uint32_as_float (uint32_t a)
{
    float value;

    memcpy (&value, &a, sizeof value);
    return value;
}
/*
 * Exhaustive test driver: walks every 32-bit pattern, converts it with both
 * float2half_rn() and float2half_rn_ref(), and stops at the first mismatch.
 *
 * Returns EXIT_FAILURE after printing the offending input and both results,
 * or EXIT_SUCCESS once the counter has wrapped around to zero.
 */
int main (void)
{
    uint32_t argi = 0;

    do {
        const float arg = uint32_as_float (argi);
        const uint16_t refi = float2half_rn_ref (arg);
        const uint16_t resi = float2half_rn (arg);

        if (resi != refi) {
            /* Cast explicitly so each argument matches its conversion
               specifier regardless of how the fixed-width types are
               defined on the platform. */
            printf ("error @ %15.8e (%08lx): resi=%04x refi=%04x\n",
                    arg, (unsigned long)argi,
                    (unsigned int)resi, (unsigned int)refi);
            return EXIT_FAILURE;
        }
        argi++;
        /* Progress report every 2^24 inputs. The line has no newline, so
           flush explicitly or it may sit in the stdio buffer unseen. */
        if ((argi & 0xffffff) == 0) {
            printf ("\r%08lx", (unsigned long)argi);
            fflush (stdout);
        }
    } while (argi);     /* unsigned wrap to 0 ends the sweep */
    return EXIT_SUCCESS;
}