Based on the question "convert from floating-point to custom numeric type", I worked out a portable, safe way to convert a floating-point value into an array of integers. The code works fine, but for some values the conversion from `double` to `unsigned long long` fails, even though the value has a precision that `unsigned long long` can safely represent. The failure is not a compile-time error: the result is an invalid value, which is either the minimum representable value of `signed long long` or zero. The conversion fails on Visual C++ 2008, Intel XE 2013 and GCC 4.7.2.
Here is the code (notice the first statement inside the `while` loop in the `main` function):
#ifndef CHAR_BIT
#include <limits.h>
#endif
#include <float.h>
#include <math.h>
// Short aliases for the built-in arithmetic types used by the code below.
// NOTE: the 32/64 in the names is what the author expects on the target
// platforms; the language itself does not guarantee these exact widths.
typedef int                int32;
typedef unsigned int       uint32;
typedef long long          int64;
typedef unsigned long long uint64;
typedef float              float32;
typedef double             float64;
// Compile-time width of type T in bits: its size in bytes (sizeof)
// multiplied by CHAR_BIT, the number of bits in one byte.
template<typename T>
struct sizeof_ex
{
    static const uint32 value = CHAR_BIT * sizeof(T);
};
// Factorial of i, computed in floating point.
//
// Returns i! as a float64. For i <= 1 (including 0 and negative values) the
// empty product 1 is returned; previously the loop body always ran once, so
// fct(0) wrongly produced 0 and negative arguments returned the argument itself.
// The factors are multiplied from i down to 2, so the result is exact only
// while i! still fits in the float64 mantissa; larger results are rounded.
float64 fct(int32 i)
{
    float64 r = 1;
    // test before multiplying so that 0! and 1! stay at 1
    for (; i > 1; --i)
        r *= i;
    return r;
}
int main()
{
// maximum 2 to power that can be stored in uint32
const uint32 power_2 = uint32(~0);
// number of binary digits in power_2
const uint32 digit_cnt = sizeof_ex<uint32>::value;
// number of array elements that will store expanded value
const uint32 comp_count = DBL_MAX_EXP / digit_cnt + uint32((DBL_MAX_EXP / digit_cnt) * digit_cnt < DBL_MAX_EXP);
// array elements
uint32 value[comp_count];
// get factorial for 23
float64 f = fct<float64>(23);
// save sign for later correction
bool sign = f < 0;
// remove sign from float-point if exists
if (sign) f *= -1;
// get number of binary digits in f
uint32 actual_digits = 0;
frexp(f, (int32*)&actual_digits);
// get start index in array for little-endian format
uint32 start_index = (actual_digits / digit_cnt) + uint32((actual_digits / digit_cnt) * digit_cnt < actual_digits) - 1;
// get all parts but the last
while (start_index > 0)
{
// store current part
// in this line the compiler fails
value[start_index] = uint64(f / power_2);
// exclude it from f
f -= power_2 * float64(value[start_index]);
// decrement index
--start_index;
}
// get last part
value[0] = uint32(f);
}
The conversion code above gives different results from one compiler to another: when the argument of the factorial function is, say, 20, all compilers return a valid result; when the argument is greater than 20, some compilers get part of the result and others don't; and when it gets bigger, e.g. 35, the result becomes zero.
Please tell me why these errors occur.
Thank you.