Simple C example of add/sub/mul/div operations in double-precision floating-points using a single-precision Floating-point system

Question

I am working on an algorithm which requires calculations in large numbers, upto e+30. I am using a 32 bit system with compiler support of 32 bits for long/float/double. So far, by searching online, I've learned that single-precision floating points (FPs) can be used for Double-precision FPs.

From this question asked by someone earlier (Emulate “double” using 2 “float”s) I found this paper which has the algorithm to work with Double-precision FPs in GPUs. It is too confusing for me to implement in C. I just need four basic mathematical operations. Is there any way I could find an example for this which will help me understand it better?

Thanks in advance.

Here is the Code I am working on. It might have errors i can not see, any suggestions would be appreciated to rectify error but that is preety much what I am trying to implement. In the algorithm, POLYNOMIAL_ORDER should be able to go up to forth order (can settle at Third order if the standard deviation is smaller). Few things I am not sure about are 1) Procedures make_float() and make_float() are correct or not, 2) Use of make_float() in the program.

#define POLYNOMIAL_ORDER    (3)
#define TC_TABLE_SIZE   (14)

typedef struct vector_float2{
float x;
float y;
}float2;

typedef struct
{
    float tc0;
    float tc1;
    float tc2;
    float tc3;
}POLYNOMIALS;

typedef struct  {
    int16_t Temp;
    int16_t Comp;   
} TempCompPair;

volatile TempCompPair TCtable[TC_TABLE_SIZE] = {{22452,1651},
                                                {25318,1444},
                                                {28268,1133},
                                                {31120,822},
                                                {34027,511},
                                                {36932,185},
                                                {39770,-81},
                                                {42685,-288},
                                                {45531,-407},
                                                {48425,-632},
                                                {51401,-703},
                                                {54460,-1143},
                                                {57202,-1420},
                                                {60027,-1652}};

POLYNOMIALS polynomials;
float matrix[TC_TABLE_SIZE][TC_TABLE_SIZE] = {0};
float average[TC_TABLE_SIZE] = {0};

float make_float(float x, float y)
{
    return x+y;
}

float2 make_float2(float a, float b)
{
    float2 f2 = {a,b};
    return f2;
}

float2 quickTwoSum(float a, float b)
{
    float s = a+b;
    float e = b - (s - a);

    float2 result = {s, e};
    return result;
}

float2 twoSum(float a, float b)
{
    volatile float s = a + b;
    float v = s - a;
    float e = (a - (s - v)) + (b - v);
    float2 result = {s , e};
    return result;
}

float2 df64_add(float2 a, float2 b)
{
    float2 s,t;
    s = twoSum(a.x, b.x);
    t = twoSum(a.y, b.y);
    s.y += t.x;
    s = quickTwoSum(s.x, s.y);
    s.y += t.y;
    s = quickTwoSum(s.x, s.y);
    return s;
}

float2 split(float a)
{
    const float split = 4097;       //(1<<12) + 1
    float t = a *split;
    float a_hi = t - (t - a);
    float a_lo = a - a_hi;
    float2 result = {a_hi, a_lo};
    return result;
}

float2 twoProd(float a, float b)
{
    float p = a*b;
    float2 aS = split(a);
    float2 bS = split(b);
    float err = ((aS.x * bS.x - p)
                + aS.x * bS.y + aS.y * bS.x)
                + aS.y * bS.y;

    float2 result = {p, err};
    return result;
}

float2 df64_mult(float2 a, float2 b)
{
    float2 p;

    p = twoProd(a.x,b.x);
    p.y += a.x * b.y;
    p.y += a.y * b.x;
    p = quickTwoSum(p.x,p.y);

    return p;
}

float2 calculate_power(float base, int pow)
{
    int i = 0;

    float2 base_f2 = make_float2(base,0);
    float2 result_f2 = {1,0};

    if(pow == 0)
    {
        return result_f2;
    }

    if(pow > 0)
    {
        if(pow == 1)
        {
            return base_f2;
        }
        else
        {
            for(i = 0; i < pow; i++)
            {
                result_f2 = df64_mult(result_f2,base_f2);
            }
            return result_f2;
        }
    }
    else
    {
        return result_f2;
        //Mechanism for negative powers
    }

}

void TComp_Polynomial()
{
    int i;
    int j;
    int k;
    int size;
    float temp;
    float2 sum = {0,0};
    float2 result0 = {0,0};
    float2 result1 = {0,0};

    float x[TC_TABLE_SIZE];
    float y[TC_TABLE_SIZE];

    for(i = 0; i < TC_TABLE_SIZE; i++)
    {
        x[i] = (float) TCtable[i].Temp;
        y[i] = (float) TCtable[i].Comp;
    }

    size = i;

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            sum.x = 0;
            sum.y = 0;

            for(k = 0; k < size; k++)
            {
                // Expression simplified below:  **sum += pow(x[k],i+j)** 
                result0 = calculate_power(x[k], i+j);
                sum = df64_add(result0,sum);
            }

            matrix[i][j] = make_float(sum.x,sum.y);
        }
    }

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        sum.x = 0;
        sum.y = 0;

        for(j = 0; j < size; j++)
        {
            // Expression simplified below: **sum += y[j] * pow(x[j],i)**
            result0 = calculate_power(x[j], i);
            result1 = df64_mult( result0 , make_float2(y[j],0) );
            sum = df64_add(result1,sum);
        }

        average[i] = make_float(sum.x,sum.y);
    }

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            if(j != i)
            {
                if(matrix[i][i]!= 0)
                {
                    temp = matrix[j][i]/matrix[i][i];
                }

                for(k = i; k < POLYNOMIAL_ORDER; k++)
                {
                    matrix[j][k] -= temp*matrix[i][k];
                }
                average[j] -= temp*average[i];

            }
        }
    }

    if(matrix[0][0] != 0)
    {
        polynomials.tc0 = average[0]/matrix[0][0];
    }
    if(matrix[1][1] != 0)
    {
        polynomials.tc1 = average[1]/matrix[1][1];
    }

    if(matrix[2][2] != 0)
    {
        polynomials.tc2 = average[2]/matrix[2][2];
    }

    if(matrix[3][3] != 0)
    {
        polynomials.tc3 = average[3]/matrix[3][3];
    }
}

and then use the struct polynomials.tc0/1/2/3 in below expression

// Y = T^3 * X3 + T^2 * X2 + T^1 * X1 + X0 ;

double calculate_equation(uint16_t TEMP)
{
    double Y;

    if(POLYNOMIAL_ORDER == 1)
    {
        Y = polynomials.tc1*(double)TEMP + polynomials.tc0; 
    }
    else if(POLYNOMIAL_ORDER == 2)
    {
        Y = (polynomials.tc2 * (double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0;  
    }
    else if(POLYNOMIAL_ORDER == 3)
    {
        Y = ((polynomials.tc3 * (double)TEMP + polynomials.tc2)*(double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0; 
    }
    else if(POLYNOMIAL_ORDER == 4)
    {
        Y = (((polynomials.tc4 * (double)TEMP + polynomials.tc3)*(double)TEMP + polynomials.tc2)*(double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0;    
    }

    return Y;
}

And standard Deviation is calculated is as follows:

//sqrt(sigma(error^2))
for(i = 0; i < TC_TABLE_SIZE; i++)
    {
        actual_comp[i] =(int) calculate_equation(TCtable[i].Temp);
        error[i] = TCtable[i].Comp - actual_comp[i] ;
        error_sqr += error[i]*error[i];

        printf("%u\t%d\t\t%e\n", TCtable[i].Temp, TCtable[i].Comp, actual_comp[i] );
    }
    error_sqrt = sqrt(error_sqr);

Reference: http://hal.archives-ouvertes.fr/docs/00/06/33/56/PDF/float-float.pdf Guillaume Da Graça, David Defour Implementation of float-float operators on graphics hardware, 7th conference on Real Numbers and Computers, RNC7.

Why do you need more than 24 bits of precision? If you need exact math, then you should be doing fixed-point arithmetic. — stark, Jul 25 '17 at 17:37
maybe you need a big number library like https://gmplib.org/ — pm100, Jul 25 '17 at 20:19
Algorithm I want to implement is third-degree polynomial. I think fixed point arithmetic requires some assumptions about the largest size in the algorithm. In my case it can go upto (range of uint16) ^ 6 ( ). Correct me if i am wrong. But thanks for tossing that method here, Always good to know yet another programming technique. — Rishit, Jul 25 '17 at 20:35
@pm100, i have limitations on code size and also on the use of open-source libraries. Thanks for the suggestions though. — Rishit, Jul 25 '17 at 20:36
Single-precision goes up to 3.4e38 in magnitude. Also, consider using [Horner's method](https://en.wikipedia.org/wiki/Horner%27s_method#Description_of_the_algorithm) and [Kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm). It would help if you put in your question how ill-conditioned your polynomials are and the range of your inputs. — Iwillnotexist Idonotexist, Jul 25 '17 at 20:41
I recommend to implement your code using `double` and then point out its deficiencies -else this is unclear/too broad. As it stands, what you need and what C offers through its standard FP type are likely sufficient when employed in the correct fashion. Show what you have tried. — chux - Reinstate Monica, Jul 27 '17 at 03:03
@chux, I am not sure with the double's compliance in C. I am using sizeof(double) to get the size and it is 4. I am using an older compiler version from TI V4.0.0, that might explain it. I will put up the code once I have something presentable, mostly by tonight or morning tomorrow. Thanks for extending a helping hand. — Rishit, Jul 27 '17 at 21:06
I too have used platforms that use a non-compliant `double` obliging code to attempt a pair of `float` or the like. IMO,detailing the higher level goal may be a better approach. Not only what are the largest values, but what precision is needed? I would not be surprised to find `float` will work for you. — chux - Reinstate Monica, Jul 27 '17 at 21:12
1) This may be better at [code review](https://codereview.stackexchange.com/) than here. Check its guidelines. 2) `twoSum()`has a problem in that a compiler is allowed to defeat the separation of calculation intended. Using `volatile float` is one way to work around that. 3) The iterative loop in `calculate_power()` is weak. Only log2(power) iterations needed and that has better precision too. 4) `TComp_Polynomial()` is weak . Consider `(((a*x + b)*x + c)*x +d)*x + e` model instead. — chux - Reinstate Monica, Jul 28 '17 at 13:53
Why 4097 instead of 4096? Scaling by 4097 immediately introduces rounding (error) wheres as *4096 could be exact. — chux - Reinstate Monica, Jul 28 '17 at 14:00
@chux: 4097, I am using it directly from the paper (page 5) I have referenced at the bottom of this question. Working on other suggestions and if you could elaborate more on point 2. Thanks. — Rishit, Jul 28 '17 at 14:27
With `float s = a + b; float v = s - a;` , code relies on the subtraction using the _true_ value of `s`. C is allowed to perform math at a higher precision than `float` (See `FLT_EVAL_METHOD`) so the sum may be retained in a `double` FP internal variable and it is that variable that is used in the subsequent subtraction, not the stored `float` one. This optimization defeats the code's intent. Although this may not happen on your target machine, it may happen on a reference machine and be the source of inconsistent results. Using `volatile float s = a + b;` is one way to avoid this. — chux - Reinstate Monica, Jul 28 '17 at 14:43
@chux: https://codereview.stackexchange.com/questions/171423/calculating-polynomial-computing-very-large-numbers-emulating-double-using-2-fl — Rishit, Jul 28 '17 at 14:44
@RishitArvindBorad I shall look at [it](https://codereview.stackexchange.com/questions/171423/calculating-polynomial-computing-very-large-numbers-emulating-double-using-2-fl) when time permits a quality review. — chux - Reinstate Monica, Jul 28 '17 at 14:45
@chux: you are right, i dont see FLT_EAL_METHOD on the target compiler (TI's V4.0.0.0 B1) and its there on the reference compiler. As a safe option I've edited the code. Thanks. And whenever you are ready to review the code, by the time i can work on other suggestions and options. — Rishit, Jul 28 '17 at 15:10
@chux: Your first comment "As it stands, what you need and what C offers through its standard FP type are likely sufficient when employed in the correct fashion." Worked for me. Its working in single precision FP. I will post the solution shortly.Thanks. — Rishit, Aug 04 '17 at 14:07
@RishitArvindBorad I did peruse the [ref](http://hal.archives-ouvertes.fr/docs/00/06/33/56/PDF/float-float.pdf) and your [code review post](https://codereview.stackexchange.com/q/171423/29485) yet did not find time for a quality response. Good you are deriving a solution. — chux - Reinstate Monica, Aug 04 '17 at 14:15

score 0 · Accepted Answer · answered Aug 07 '17 at 20:08

I was able to implement this code without using double precision as the calculations were in the range of Float. Here's my implementation, let me know if I can optimize it better.

typedef struct
{   int64_t tc0;
    int64_t tc1;
    int64_t tc2;
    int64_t tc3;
    int64_t tc4;
}POLYNOMIALS;

POLYNOMIALS polynomials = {0,0,0,0,0};
int16_t TempCompIndex;
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];

float matrix[POLYNOMIAL_ORDER+1][POLYNOMIAL_ORDER+1] = {0};
float average[POLYNOMIAL_ORDER+1] = {0};

void TComp_Polynomial()
{
    int i;
    int j;
    int k;
    int size;
    float temp;
    float sum = 0;
    float powr = 0;
    float prod;

    int64_t x[TC_TABLE_SIZE];
    int64_t y[TC_TABLE_SIZE];

    for(i = 0; i < TC_TABLE_SIZE; i++)
    {
        x[i] = (int64_t) TCtable[i].Temp;
        y[i] = (int64_t) TCtable[i].Comp<<PRECISION;
        printf("x: %lld, y:%lld\n",x[i],y[i]);
    }

    size = i;

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            sum = 0;
            powr = 0;
            for(k = 0; k < size; k++)
            {       
                //printf("x[%d]: %ld, i: %d ,j: %d ", k, x[k],i,j);
                powr = pow(x[k],i+j);
                //printf("Power: %f, sum: %f\n ",powr,sum);
                sum +=  powr;
                //printf("%f\r\n",powr);
                //printf("sum: %lf\n",sum );
            }

            matrix[i][j] = sum;
            printf("sum: %g\n",sum);
        }
    }

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        sum = 0;
        powr = 0;

        for(j = 0; j < size; j++)
        {
            //sum += y[j] * pow(x[j],i)
            //printf("sum: %lf, y[%d]: %lf, x[%d]: %lf^%d  ",sum,j,y[j], i, x[j],i);
            //printf("x[%d]:%lld ^ %d\t",j,x[j],i);
            powr = (float) pow(x[j],i);
            printf("powr: %f\t",powr);

            prod = (float) y[j] * powr;
            printf("prod:%f \t %lld \t", prod,y[j]);

            sum += (float) prod;
            printf("sum: %f \n",sum);
        }

        average[i] = sum;
        //printf("#Avg: %f\n",average[i]);
    }
    printf("\n\n");

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            if(j != i)
            {   
                if(matrix[i][i]!= 0)
                {
                    //printf("matrix%d%d: %g / matrix%d%d: %g =\t ",j,i,matrix[j][i],i,i,matrix[i][i]);
                    temp = matrix[j][i]/matrix[i][i];
                    //printf("Temp: %g\n",temp);
                }   

                for(k = i; k < POLYNOMIAL_ORDER; k++)
                {   
                    matrix[j][k] -= temp*matrix[i][k];
                    //printf("matrix[%d][%d]:%g, %g, matrix[%d][%d]:%g\n",j,k,matrix[j][k], temp,i,k,matrix[i][k]);
                }
                //printf("\n\n");
                //print_matrix();
                printf("\n\n");

                //printf("avg%d: %g\ttemp: %g\tavg%d: %g\n\n",j,average[j],temp,i,average[i]);
                average[j] -= temp*average[i];
                printf("#Avg%d:%g\n",j,average[j]);
                //print_average();
            }
        }
    }

    print_matrix();
    print_average();




/* Calculate polynomial Coefficients (n+1) based on the POLYNOMIAL_ORDER (n) */
#ifndef POLYNOMIAL_ORDER

#elif POLYNOMIAL_ORDER == 0
    if(matrix[0][0] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
    }
#elif POLYNOMIAL_ORDER == 1
    if(matrix[1][1] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
    }
#elif POLYNOMIAL_ORDER == 2
    if(matrix[2][2] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
        polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
    }
#elif POLYNOMIAL_ORDER == 3
    if(matrix[3][3] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
        polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
        polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
    }
#elif POLYNOMIAL_ORDER == 4
    if(matrix[4][4] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
        polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
        polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
        polynomials.tc4 = (int64_t) (average[4]/matrix[4][4]);
    }
#endif

    }



 int16_t calculate_equation(uint16_t TEMP)
{
    int64_t Y = 0;
    int16_t TempComp = 0;

#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
        Y = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
        Y = polynomials.tc1* ((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
        Y = (polynomials.tc2 * ((int64_t)TEMP) + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
        Y = ((polynomials.tc3 * ((int64_t)TEMP) + polynomials.tc2)*((int64_t)TEMP) + polynomials.tc1)*((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
        Y = (((polynomials.tc4 * (int64_t)TEMP + polynomials.tc3)*(int64_t)TEMP + polynomials.tc2)*(int64_t)TEMP + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#endif
    TempComp = (int16_t) (Y>>PRECISION_BITS);

    return TempComp;
}

void main(){
int16_t TempComp = 0;
TempCompValue = (int16_t) calculate_equation(Mon_Temp);
}

Note: Calculate_Equation() is being called once a second and it is required to not use float in order to avoid floating point arithmetic, hence I am using non-float variables in that function.

It is working right for me and haven't discovered any bug after initial testing. Thanks every one for taking interest in my post, if not the answer, got to learn some new techniques. And thanks @chux.

Simple C example of add/sub/mul/div operations in double-precision floating-points using a single-precision Floating-point system

1 Answers1