linear fit on log log axis plot does not fit the data

Question

When I try to fit a linear curve to my data points the curve does not match my data. The line is below my data points and the slope is too deep, too. It could be due to starting at x=10 and its corresponding y value, but I am not sure. least square method should be correct, as I tried multiple ways of calculating slope and intercept, but all look methods produce the same output.

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#define PATH_GNUPLOT "your/path/gnuplot"

void ERROR(char* error_string);

int main() {

    // first 10 data points are irrelevant
    double data[100] = {0,0,0,0,0,0,0,0,0,0, 0.242910, 0.235523, 0.228976, 0.223114, 0.217820, 0.213005, 0.208597, 0.204539, 0.200786, 0.197299, 0.194047, 0.191003, 0.188146, 0.185456, 0.182916, 0.180513, 0.178234, 0.176068, 0.174006, 0.172039, 0.170160, 0.168362, 0.166639, 0.164987, 0.163399, 0.161872, 0.160402, 0.158984, 0.157617, 0.156297, 0.155020, 0.153785, 0.152590, 0.151431, 0.150308, 0.149218, 0.148159, 0.147131, 0.146131, 0.145158, 0.144211, 0.143289, 0.142391, 0.141515, 0.140661, 0.139827, 0.139014, 0.138219, 0.137442, 0.136683, 0.135941, 0.135215, 0.134504, 0.133809, 0.133128, 0.132461, 0.131807, 0.131166, 0.130538, 0.129922, 0.129318, 0.128725, 0.128142, 0.127571, 0.127010, 0.126458, 0.125916, 0.125384, 0.124861, 0.124346, 0.123840, 0.123342, 0.122853, 0.122371, 0.121897, 0.121430, 0.120970, 0.120518, 0.120072, 0.119633, 0.119200, 0.118774, 0.118354, 0.117939, 0.117531, 0.117128, 0.116731, 0.116340, 0.115953, 0.115572};

    int n = 100;

    /* calc fit */
    double sumx, sumy, sumx2, sumxy, a, b = 0.0;

    // => log(y) = a * log(x) + b
    // data starts at 10 for x value
    for (int i = 10; i < n; i++) {

        double x = log(i);
        double y = log(data[i]);

        sumx += x;
        sumx2 += x * x;
        sumy += y;
        sumxy += y * x;
    }

    a = (n * sumxy  -  sumx * sumy) / (n * sumx2 - sumx*sumx);
    b = (sumy * sumx2  -  sumx * sumxy) / (n * sumx2 - sumx*sumx);
    /* end calc fit */

    

    /* save data and plot */
    FILE *fp = fopen("data.dat", "w");
    
    if (fp == NULL) {
        ERROR("failed to open data.dat");
    }
    
    for (int i = 10; i < n; i++)
        fprintf(fp, "%d %.16lf\n", i, data[i]);
    
    fclose(fp);

    FILE *GNUpipe = popen(PATH_GNUPLOT, "w");
    
    if (GNUpipe == NULL) {
        ERROR("failed to open gnuplot");
    }
    
    fprintf(GNUpipe, "set term png\n");
    fprintf(GNUpipe, "set logscale xy\n");

    // log(y) = a * log(x) + b
    // => y = x**a + 10**b              
    fprintf(GNUpipe, "Fit(x) = x**%lf * 10**%lf\n", a, b);
    
    fprintf(GNUpipe, "plot [0.1:%d*1.1] 'data.dat' title 'error', ", n);

    fprintf(GNUpipe, "Fit(x) title 'fit'\n");
    
    pclose(GNUpipe);

}

void ERROR(char* error_string) {
    
    fprintf(stderr, error_string);
    
    exit(EXIT_FAILURE);
}

EDIT:

Code so far with all changes implemented:

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#define PATH_GNUPLOT "your/path/gnuplot"

void ERROR(char* error_string);

int main() {

    // first 10 data points are irrelevant
    double data[100] = {0,0,0,0,0,0,0,0,0,0, 0.242910, 0.235523, 0.228976, 0.223114, 0.217820, 0.213005, 0.208597, 0.204539, 0.200786, 0.197299, 0.194047, 0.191003, 0.188146, 0.185456, 0.182916, 0.180513, 0.178234, 0.176068, 0.174006, 0.172039, 0.170160, 0.168362, 0.166639, 0.164987, 0.163399, 0.161872, 0.160402, 0.158984, 0.157617, 0.156297, 0.155020, 0.153785, 0.152590, 0.151431, 0.150308, 0.149218, 0.148159, 0.147131, 0.146131, 0.145158, 0.144211, 0.143289, 0.142391, 0.141515, 0.140661, 0.139827, 0.139014, 0.138219, 0.137442, 0.136683, 0.135941, 0.135215, 0.134504, 0.133809, 0.133128, 0.132461, 0.131807, 0.131166, 0.130538, 0.129922, 0.129318, 0.128725, 0.128142, 0.127571, 0.127010, 0.126458, 0.125916, 0.125384, 0.124861, 0.124346, 0.123840, 0.123342, 0.122853, 0.122371, 0.121897, 0.121430, 0.120970, 0.120518, 0.120072, 0.119633, 0.119200, 0.118774, 0.118354, 0.117939, 0.117531, 0.117128, 0.116731, 0.116340, 0.115953, 0.115572};

    int n = 100;

    /* calc fit */
    double sumx = 0.0; 
    double sumy = 0.0; 
    double sumx2 = 0.0;
    double sumxy = 0.0;
    double a = 0.0;
    double b = 0.0;

    // => log(y) = a * log(x) + b
    // data starts at 10 for x value
    for (int i = 10; i < n; i++) {

        double x = log(i);
        double y = log(data[i]);

        sumx += x;
        sumx2 += x * x;
        sumy += y;
        sumxy += y * x;
    }

    int N = n - 10;
    double denom = N*sumx2 - sumx * sumx;
    b = (sumy * sumx2 - sumx * sumxy)/ denom;
    a = (N*sumxy - sumx * sumy)/ denom;

    printf("%g %g\n",a , b);
    /* end calc fit */

    

    /* save data and plot */
    FILE *fp = fopen("data.dat", "w");
    
    if (fp == NULL) {
        ERROR("failed to open data.dat");
    }
    
    for (int i = 10; i < n; i++)
        fprintf(fp, "%d %.16le\n", i, data[i]);
    
    fclose(fp);

    FILE *GNUpipe = popen(PATH_GNUPLOT, "w");
    
    if (GNUpipe == NULL) {
        ERROR("failed to open gnuplot");
    }
    
    fprintf(GNUpipe, "set term png\n");
    fprintf(GNUpipe, "set logscale xy\n");

    // log(y) = a * log(x) + b
    // => y = x**a * 10**b              
    fprintf(GNUpipe, "Fit(x) = x**%.16lf * 10**%.16lf\n", a, b);
    
    fprintf(GNUpipe, "plot [0.1:%d*1.1] 'data.dat' title 'error', ", n);

    fprintf(GNUpipe, "Fit(x) title 'fit'\n");
    
    pclose(GNUpipe);

}

void ERROR(char* error_string) {
    
    fprintf(stderr, error_string);
    
    exit(EXIT_FAILURE);
}

"data.dat": link stdout: -0.323977 -0.66909 for printf("%g %g\n",a , b);

`double sumx, sumy, sumx2, sumxy, a, b = 0.0;` only initialises `b` to `0.0`. All the other variables in that line are uninitialised and probably contain random junk. — pmacfarlane, May 01 '23 at 15:28
@Max, Code comment `// => y = x**a + 10**b` looks amiss. Should it be `// => y = x**a * 10**b`? — chux - Reinstate Monica, May 01 '23 at 17:02
Yes, fixed the comment. Was correct in the fprintf statement, tho. — Max, May 01 '23 at 17:04

chux - Reinstate Monica · Accepted Answer · 2023-05-01T17:22:52.947

3

At least these problems:

Need to zero initialize

// double sumx, sumy, sumx2, sumxy, a, b = 0.0; 
double sumx = 0.0;
double sumy = 0.0;
...

Save time

Enable more warnings: "warning: excess elements in array initializer"

Too many initializers

// double data[100] = {  101 initializers };

Avoid losing precision

// fprintf(GNUpipe, "Fit(x) = x**%lf * 10**%.*le\n", a, b);
fprintf(GNUpipe, "Fit(x) = x**%.16le * 10**%.16le\n", a, b);

// fprintf(fp, "%d %.16lf\n", i, data[i]);
fprintf(fp, "%d %.16le\n", i, data[i]);

Wrong n

Code uses n. Since, for (int i = 10; i < n; i++), should be n-10 in a, b formulas.

Maybe wrong formula?

Looks wrong to me, need deeper review.

// ??
a = (n * sumxy  -  sumx * sumy) / (n * sumx2 - sumx*sumx);
b = (sumy * sumx2  -  sumx * sumxy) / (n * sumx2 - sumx*sumx);

I expect: ref.

N = n - 10;
double denom = N*sumx2 - sumx * sumx;
a = (sumy * sumx2 - sumx * sumxy)/ denom;
b = (N*sumxy - sumx * sumy)/ denom;

Ah, I see we have a, b reversed.

[Append]

Wrong base

Code is using log() (log base-e). Likely wants log10() if "Fit(x) = x**%lf * 10**%lf\n" is important.

edited May 01 '23 at 17:22

answered May 01 '23 at 15:38

chux - Reinstate Monica

143,097
13
135
256

Sorry, maybe copied one date too much. Data (array) and data.dat (file) should look something like this https://pastebin.com/dY699gjN. Deleted the last date in the array. – Max May 01 '23 at 15:42
1

@max To be clear, the real mistake is not enabling all warnings, not so much _too many initializers_. – chux - Reinstate Monica May 01 '23 at 15:44
When compiling like `gcc .\stackoverflow_minExample.c -std=gnu99 -lm`, I am not getting any warnings. How do I enable them? – Max May 01 '23 at 15:46
@Max `-Wall` should be the flag. – AnArrayOfFunctions May 01 '23 at 15:48
1

Try "-pedantic -Wall -Wextra -Wconversion". – chux - Reinstate Monica May 01 '23 at 15:48
Add `-Wshadow` and then `-Werror` (to keep yourself honest `:)` (in addition to what @chux-ReinstateMonica said) Confused by all the options? Perfect time to learn. See [man 1 gcc](https://man7.org/linux/man-pages/man1/gcc.1.html) – David C. Rankin May 01 '23 at 15:51
@chux-ReinstateMonica x starts at value 10. So the first data point is `(10, data[10])` – Max May 01 '23 at 15:58
and what I am supposed to do with that ref link? – Max May 01 '23 at 15:59
I used the code from this answer https://stackoverflow.com/questions/5083465/fast-efficient-least-squares-fit-algorithm-in-c – Max May 01 '23 at 16:01
1

Ummmm ... look up what each of the options do for you to start with `:)` That way you learn what options are available to compile your code. When you begin using an IDE -- you can tell the IDE how you want your code compiled -- instead of hoping the IDE set the options you need and enabled all warnings for you (they don't) – David C. Rankin May 01 '23 at 16:03
@Max "x starts at value 10. So the first data point is (10, data[10])" --> do not use `n` in `a = ..., b = ...`, use `(n-10)`. – chux - Reinstate Monica May 01 '23 at 16:05
The line appears to be parallel now, but the intercept is still off https://imgur.com/a/Xw841GL – Max May 01 '23 at 16:55
@Max 1) Note minor change to `fprintf(fp, "%d %.16le\n", i, data[i]);`. 2) Try `printf("%g %g\n",a , b);`. What do you get? – chux - Reinstate Monica May 01 '23 at 17:07
`a: -0.323978, b: -0.669088` returns your statement in my code – Max May 01 '23 at 17:09
I get `-0.323977 -0.66909`. I do not see our difference as significant to your plot, yet curious. Perhaps it is time to _append_ your current solution to the question. Also post `stdout` and `"data.dat"`. – chux - Reinstate Monica May 01 '23 at 17:12
@Max Do you want `log()` or `log10()`? – chux - Reinstate Monica May 01 '23 at 17:23
Updated answer. I think I do want `log10()`. Gnuplot sets logscale with log10, right? So this "linear" correlation should be in log10? – Max May 01 '23 at 17:25
1

yes, `log10()`with all your previous effort has fixed it. – Max May 01 '23 at 17:26

linear fit on log log axis plot does not fit the data

1 Answers1