Exponent range
For a 32-bit float the raw exponent rexp is 8 bits wide, i.e. <0,255>, and the bias is 127.
Excluding the special cases { 0,255 } we get <1,254>.
Applying the bias:
expmin = 1-127 = -126
expmax = 254-127 = +127
Denormal values have no implicit 1, so for the smallest number the mantissa is 1,
and if the exponent should point to the lsb of the mantissa then we need to shift a few more bits:
expmin = 0-127-(23-1) = -149
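The normal max value has the maximal mantissa (24 bits including the implicit 1). With the exponent pointing to the lsb of the mantissa (127-23 = 104) we get:
max = ((2^24)-1)*(2^104) = (2^24)*(2^104) - (2^104) = 2^128 - 2^104
so the real range (denormals included) of float
is:
<2^-149 ,2^+128 )
<1.40e-45,3.40e+38)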
In most specs and docs only the exponent for normalized numbers is shown so:
<2^-126 ,2^+127 >
<1.175e-38,1.701e38>
Here is a small C++/VCL example of dissecting the 32 and 64 bit floats:
//$$---- Form CPP ----
//---------------------------------------------------------------------------
#include <vcl.h>
#include <math.h>
#pragma hdrstop
#include "Unit1.h"
//---------------------------------------------------------------------------
#pragma package(smart_init)
#pragma resource "*.dfm"
TForm1 *Form1;
//---------------------------------------------------------------------------
// Fixed-width 32-bit integer aliases (built on the compiler-specific __int32 keyword)
typedef unsigned __int32 U32;
typedef __int32 S32;
//---------------------------------------------------------------------------
// IEEE 754 double masks; they apply to the most significant 32-bit word (MSW) of the 64-bit value
const U32 _f64_sig =0x80000000; // sign bit
const U32 _f64_exp =0x7FF00000; // biased exponent field (11 bits)
const U32 _f64_exp_sig=0x40000000; // exponent sign (top bit of the biased exponent field)
const U32 _f64_exp_bia=0x3FF00000; // exponent bias (1023) placed at the exponent field position
const U32 _f64_exp_lsb=0x00100000; // exponent LSB; also the position of the implicit mantissa bit in the MSW
const U32 _f64_exp_pos= 20; // exponent LSB bit position within the MSW
const U32 _f64_man =0x000FFFFF; // mantissa bits stored in the MSW (upper 20 of 52)
const U32 _f64_man_msb=0x00080000; // mantissa MSB
const U32 _f64_man_bits= 52; // mantissa bits (stored bits, implicit leading 1 not counted)
// NOTE(review): labelled "abs min number", but the smallest positive normalized double is ~2.2e-308
// and the smallest denormal is ~4.9e-324; not referenced in the visible code - confirm the intended value.
const double _f64_lsb = 1.7e-308; // abs min number
// IEEE 754 single masks; value range including denormals is <2^-149,2^+128) = <1.40e-45,3.40e+38)
const U32 _f32_sig =0x80000000; // sign bit
const U32 _f32_exp =0x7F800000; // biased exponent field (8 bits)
const U32 _f32_exp_sig=0x40000000; // exponent sign (top bit of the biased exponent field)
const U32 _f32_exp_bia=0x3F800000; // exponent bias (127) placed at the exponent field position
const U32 _f32_exp_lsb=0x00800000; // exponent LSB; also the position of the implicit mantissa bit
const U32 _f32_exp_pos= 23; // exponent LSB bit position
const U32 _f32_man =0x007FFFFF; // mantissa bits
const U32 _f32_man_msb=0x00400000; // mantissa MSB
const U32 _f32_man_bits= 23; // mantissa bits (stored bits, implicit leading 1 not counted)
// NOTE(review): labelled "abs min number", but the smallest positive normalized float is ~1.18e-38
// and the smallest denormal is ~1.4e-45; not referenced in the visible code - confirm the intended value.
const float _f32_lsb = 3.4e-38;// abs min number
//---------------------------------------------------------------------------
// Dissects a 64-bit IEEE 754 double into its sign, integer mantissa and the
// power-of-two exponent of the mantissa LSB, then appends one line of the form
//   x = <sign><mantissa hex>h<< or >><shift> = y
// to Form1->mm_log, where y is x rebuilt from the decoded parts.
// Inf and NaN are logged by name; their mantissa/exponent are cleared, so the
// rebuilt value for them is a (signed) zero.
void f64_disect(double x)
{
    const int h=1; // index of the high 32-bit word; may be platform dependent (MSB/LSB order)
    const int l=0; // index of the low 32-bit word
    union _f64     // raw bit access to the double
    {
        double f;  // 64bit floating point
        U32 u[2];  // 2x32 bit uint
    } f64;
    AnsiString txt="";
    U32 man[2];
    S32 exp,bias;
    char sign='+';

    f64.f=x;
    bias=_f64_exp_bia>>_f64_exp_pos;
    if (f64.u[h]&_f64_sig) sign='-';
    exp =(f64.u[h]&_f64_exp)>>_f64_exp_pos;
    exp-=bias;
    man[h]=f64.u[h]&_f64_man;
    man[l]=f64.u[l];

    if (exp==-bias) // zero, denormalized
    {
        exp-=_f64_man_bits-1; // change exp pointing from msb to lsb (ignoring implicit bit)
        txt=AnsiString().sprintf("%c%06X%08Xh>>%4i",sign,man[h],man[l],-exp);
    }
    else if (exp==+bias+1) // Inf,NaN
    {
        // parentheses are required: == binds tighter than |, and without them
        // every NaN with a non-zero high mantissa word was reported as Inf
        if ((man[h]|man[l])==0) txt=AnsiString().sprintf("%cInf ",sign);
        else                    txt=AnsiString().sprintf("%cNaN ",sign);
        man[h]=0; man[l]=0; exp=0;
    }
    else // normalized
    {
        exp-=_f64_man_bits;   // change exp pointing from msb to lsb
        man[h]|=_f64_exp_lsb; // implicit msb mantisa bit for normalized numbers
        if (exp<0) txt=AnsiString().sprintf("%c%06X%08Xh>>%4i",sign,man[h],man[l],-exp);
        else       txt=AnsiString().sprintf("%c%06X%08Xh<<%4i",sign,man[h],man[l],+exp);
    }

    // reconstruct man,exp back to double (low word first, high word is 2^32 times heavier)
    double y=double(man[l])*pow(2.0,exp);
    y+=double(man[h])*pow(2.0,exp+32.0);
    if (sign=='-') y=-y; // the mantissa is unsigned, so the sign has to be applied separately
    Form1->mm_log->Lines->Add(AnsiString().sprintf("%21.10lf = %s = %21.10lf",x,txt,y));
}
//---------------------------------------------------------------------------
// Converts x to a 32-bit IEEE 754 float and dissects it into its sign, integer
// mantissa and the power-of-two exponent of the mantissa LSB, then appends one
// line of the form
//   x = <sign><mantissa hex>h<< or >><shift> = y
// to Form1->mm_log, where y is the float rebuilt from the decoded parts.
// Inf and NaN are logged by name; their mantissa/exponent are cleared, so the
// rebuilt value for them is a (signed) zero.
void f32_disect(double x)
{
    union _f32 // float bits access
    {
        float f; // 32bit floating point
        U32 u;   // 32 bit uint
    } f32;
    AnsiString txt="";
    U32 man;
    S32 exp,bias;
    char sign='+';

    f32.f=x; // narrowing double -> float is intended here
    bias=_f32_exp_bia>>_f32_exp_pos;
    if (f32.u&_f32_sig) sign='-';
    exp =(f32.u&_f32_exp)>>_f32_exp_pos;
    exp-=bias;
    man =f32.u&_f32_man;

    if (exp==-bias) // zero, denormalized
    {
        exp-=_f32_man_bits-1; // change exp pointing from msb to lsb (ignoring implicit bit)
        txt=AnsiString().sprintf("%c%06Xh>>%3i",sign,man,-exp);
    }
    else if (exp==+bias+1) // Inf,NaN
    {
        if (man==0) txt=AnsiString().sprintf("%cInf ",sign);
        else        txt=AnsiString().sprintf("%cNaN ",sign);
        man=0; exp=0;
    }
    else // normalized
    {
        exp-=_f32_man_bits; // change exp pointing from msb to lsb
        man|=_f32_exp_lsb;  // implicit msb mantisa bit for normalized numbers
        if (exp<0) txt=AnsiString().sprintf("%c%06Xh>>%3i",sign,man,-exp);
        else       txt=AnsiString().sprintf("%c%06Xh<<%3i",sign,man,+exp);
    }

    // reconstruct man,exp back to float
    float y=float(man)*pow(2.0,exp);
    if (sign=='-') y=-y; // the mantissa is unsigned, so the sign has to be applied separately
    Form1->mm_log->Lines->Add(AnsiString().sprintf("%21.10f = %s = %21.10f",x,txt,y));
}
//---------------------------------------------------------------------------
//--- Builder: --------------------------------------------------------------
//---------------------------------------------------------------------------
// Form constructor: fills mm_log with three demo sections -
//   [Float]  f32_disect on denormalized / zero / NaN / Inf / normalized samples
//   [Double] f64_disect on the same categories of samples
//   [Fixed]  + - * / done on fixed-point integers with n fractional bits,
//            printed next to the floating point result for comparison
__fastcall TForm1::TForm1(TComponent* Owner):TForm(Owner)
{
    mm_log->Lines->Add("[Float]\r\n");
    f32_disect(123*pow(2.0,-127-22));   // Denormalized: 123 * 2^-149 (float mantissa lsb)
    f32_disect(+0.0);                   // Zero
    f32_disect(-0.0);                   // Zero
    f32_disect(+0.0/0.0);               // NaN
    f32_disect(-0.0/0.0);               // NaN
    f32_disect(+1.0/0.0);               // Inf
    f32_disect(-1.0/0.0);               // Inf
    f32_disect(+123.456);               // Normalized
    f32_disect(-0.000123);              // Normalized

    mm_log->Lines->Add("\r\n[Double]\r\n");
    // 123 * 2^-149 is a normalized double, so the double sample uses the
    // double mantissa lsb 2^-1074 to really hit the denormalized branch
    f64_disect(123*pow(2.0,-1023-51));  // Denormalized: 123 * 2^-1074
    f64_disect(+0.0);                   // Zero
    f64_disect(-0.0);                   // Zero
    f64_disect(+0.0/0.0);               // NaN
    f64_disect(-0.0/0.0);               // NaN
    f64_disect(+1.0/0.0);               // Inf
    f64_disect(-1.0/0.0);               // Inf
    f64_disect(+123.456);               // Normalized
    f64_disect(-0.000123);              // Normalized

    mm_log->Lines->Add("\r\n[Fixed]\r\n");
    const int n=10;                         // number of fractional bits
    float fx=12.345,fy=4.321,fm=1<<n;       // fm = 2^n scales float <-> fixed point
    int x=float(fx*fm);                     // fixed-point copies of fx,fy
    int y=float(fy*fm);
    mm_log->Lines->Add(AnsiString().sprintf("%7.3f + %7.3f = %8.3f = %8.3f",fx,fy,fx+fy,float(int((x+y) ))/fm));
    mm_log->Lines->Add(AnsiString().sprintf("%7.3f - %7.3f = %8.3f = %8.3f",fx,fy,fx-fy,float(int((x-y) ))/fm));
    // a product carries 2n fractional bits, so shift n of them away
    mm_log->Lines->Add(AnsiString().sprintf("%7.3f * %7.3f = %8.3f = %8.3f",fx,fy,fx*fy,float(int((x*y)>>n))/fm));
    // a quotient loses all fractional bits: integer part and remainder are rescaled separately
    mm_log->Lines->Add(AnsiString().sprintf("%7.3f / %7.3f = %8.3f = %8.3f",fx,fy,fx/fy,float(int((x/y)<<n))/fm
                                                                                      +float(int(((x%y)<<n)/y))/fm));
}
//---------------------------------------------------------------------------
Which might help you understand a bit more ... If you're interested then look also at this:
exponent bias
It was selected as the middle between the range edges:
bias = (0+255)/2 = 127
simply to have, as far as possible, the same range for positive and negative exponents.
modulo
Using exp=rexp%127
will not give you negative values from an unsigned rexp
no matter what, not to mention that division is a slow operation (at least at the time the specs were created)... That is why exp=rexp-bias is used.