to round a floating point number to a smaller precision
Is this possible with standard library facilities, without having to assume too much about the floating point representation
If we assume the common FLT_RADIX == 2
, easy enough to use frexp()
to extract the significand, round (or truncate), and then reform.
#include <math.h>
#include <stdio.h>
double round3(double value) {
// Get significand
int exp;
printf("%-22a %.20f\n", value, value);
double normalized_fraction = frexp(value, &exp); // [0.5 ... 1.0) or zero,
printf("%-22a %.20f\n", normalized_fraction, normalized_fraction);
// Scale to a whole number range
double normalized_integer = normalized_fraction * pow(FLT_RADIX, DBL_MANT_DIG);
printf("%-22a %.20f %llX\n", normalized_integer, normalized_integer,
(unsigned long long) normalized_integer);
// .. but we really want it scaled by 3 bits less
double normalized_integer_3 = normalized_fraction * pow(FLT_RADIX, DBL_MANT_DIG - 3);
printf("%-22a %.20f\n", normalized_integer_3, normalized_integer_3);
// round
double rounded_normalized_integer_3 = round(normalized_integer_3);
printf("%-22a %.20f\n", rounded_normalized_integer_3, rounded_normalized_integer_3);
// reform
double y = ldexp(rounded_normalized_integer_3, exp - (DBL_MANT_DIG - 3));
printf("%-22a %.20f\n", y, y);
puts("");
return y;
}
Simplified
double round3(double value) {
assert(FLT_RADIX == 2);
int exp;
double normalized_fraction = frexp(value, &exp); // [0.5 ... 1.0) or zero,
double rounded_norm_integer_3 = round(normalized_fraction * pow(2, DBL_MANT_DIG-3));
return ldexp(rounded_norm_integer_3, exp - (DBL_MANT_DIG - 3));
}
Test
int main() {
round3(1.0/3.0);
}
Output
// v --- last hex digit in binary 0101
0x1.5555555555555p-2 0.33333333333333331483
0x1.5555555555555p-1 0.66666666666666662966
0x1.5555555555555p+52 6004799503160661.00000000000000000000 15555555555555
0x1.5555555555555p+49 750599937895082.62500000000000000000
0x1.5555555555558p+49 750599937895083.00000000000000000000
0x1.5555555555558p-2 0.33333333333333348136
// ^ --- last hex digit in binary 1000 (rounded)
To accommodate rare cases when FLT_RADIX != 2
, other considerations are needed to maintain accuracy as frexp()
and ldexp()
work with powers-of-2.
For C++ users, sorry the output uses printf()
, but that is only illustrative and can be removed.