Implemented simple linear regression in C, but much slower with p_thread

Question

I implemented simple linear regression in C, one with single thread and the other with multi-thread which runs much slower.

The single thread version is attached below. It is very simple. First it reads the xs and ys from two files within the same folder. Then, the calculation of the coefficient and intercept of the line is just addition and multiplication of the data from xs and ys.

// gcc simple_regression.cpp -lstdc++ -o simple_regression

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h> 

const long long int N = 8000000*10;

typedef struct {
    double coef;
    double intercept;
} lr_res;

lr_res lr(double *xs, double *ys) {
    double sumxy = 0.0, sumx = 0.0, sumy = 0.0, sumx2 = 0.0;
    int i;
    for(i=0; i<N; ++i) {
        sumxy += xs[i]*ys[i];
        sumx += xs[i];
        sumy += ys[i];
        sumx2 += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumx=%f\n", xs[i], sumx);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx/N, sumy/N);
    lr_res res;
    res.coef = (N*sumxy - sumx*sumy)/(N*sumx2 - sumx*sumx);
    res.intercept = (sumy - res.coef*sumx)/N;
    return res;
}

int main(int argc, char**argv) {

    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;

    char *ptr;
    float data;

    //double xs[N], ys[N]; 
    double *xs = new double[N];
    double *ys = new double[N];
    int i;
    //printf("hello!\n");
    i = 0;    
    x_file = fopen("xs.data", "r");    
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }

    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    printf("start fitting...\n");

    clock_t t; 
    t = clock(); 

    lr_res res = lr(xs, ys);

    t = clock() - t; 
    double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds 


    printf("coef: %f, intercept: %f\n", res.coef, res.intercept);

    printf("fitting took %f seconds\n", time_taken); 

}

The multi-threaded version is attached below. When calculating the coefficient and intercept, it devides the summation work into different threads, with the hope that the speed can be raised up by roughly 8 times compared the single-threaded version.

// gcc simple_regression_mt.cpp -lstdc++ -lpthread  -o simple_regression_mt
// https://www.statisticshowto.com/what-is-a-regression-equation/

#include <sys/types.h>
#include <sys/wait.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h> 
#include <pthread.h>

#include<string.h> 

const long long int N = 8000000*10;
//const long long int N = 800;
const int nprocs = 8;
int chunksize = N/nprocs;

//double xs[N], ys[N]; 
double *xs = new double[N];
double *ys = new double[N];
double sumxy[nprocs], sumx[nprocs], sumy[nprocs], sumx2[nprocs];


void *lr_calc(void *arg) {
    char* coreid_str = (char *) arg;
    int coreid = strtol(coreid_str, (char **)NULL, 10) - 1;
    int offset = coreid*chunksize;
    //printf("core id:%d starting up...\n", coreid);

    for(int i=offset; i<offset+chunksize; ++i) {
        sumxy[coreid] += xs[i]*ys[i];
        sumx[coreid] += xs[i];
        sumy[coreid] += ys[i];
        sumx2[coreid] += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx[coreid]/chunksize, sumy[coreid]/chunksize);

    //printf("core id:%d finishing...\n", coreid);

    return NULL;  
}

char* itoa(int val, int base){  
    static char buf[32] = {0};  
    int i = 30; 
    for(; val && i ; --i, val /= base)  
        buf[i] = "0123456789abcdef"[val % base];    
    return &buf[i+1];   
}


int main(int argc, char**argv) {

    printf("This system has %d processors configured and "
        "%d processors available.\n",
        get_nprocs_conf(), get_nprocs());



    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;

    char *ptr;
    float data;


    int i;

    // read x, y into dynamic arrays xs, ys
    i = 0;    
    x_file = fopen("xs.data", "r");    
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }

    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    // multithreaded version of linear regression
    printf("start fitting...\n");
    struct timeval tp;
    gettimeofday(&tp, NULL);
    int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;

    pthread_t tid[nprocs];
    for (int i=0; i<nprocs; ++i) {
        char* scoreid = itoa(i+1, 10);
        char* scoreid_dup = strdup(scoreid);  

        pthread_create(&tid[i], NULL, lr_calc, (void*)(scoreid_dup));
    }

    void* v;
    for (int i=0; i<nprocs; ++i) {
        pthread_join(tid[i], &v);
    }

    double ssumxy = 0.0, ssumx = 0.0, ssumy = 0.0, ssumx2 = 0.0;
    for (int i=0; i<nprocs; ++i) {
        ssumxy += sumxy[i];
        ssumx += sumx[i];
        ssumy += sumy[i];
        ssumx2 += sumx2[i];
        //printf("sumxy[i]: %f\n", sumxy[i]);
        //printf("ssumxy: %f\n", ssumxy);
    }
    //printf("avg(x): %f, avg(y): %f\n", ssumx/N, ssumy/N);

    double coef = (N*ssumxy - ssumx*ssumy)/(N*ssumx2 - ssumx*ssumx);
    //printf("N*ssumxy is: %f\n", ssumx);
    double intercept = (ssumy - coef*ssumx)/N;

    struct timeval tp1;
    gettimeofday(&tp1, NULL);
    int ms2 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;

    int time_taken = (ms2 - ms); // in million seconds 


    printf("coef: %f, intercept: %f\n", coef, intercept);    
    printf("fitting took %d milli seconds\n", time_taken); 

}

However, the result suggests otherwise as attached below, the multi-threaded version being much slower instead. Note that the timing is only for calculating/fitting the linear regression and doesn't count in the part of reading data from files. The time difference also doesn't seem to be the overhead of creating thread. (creating thread wouldn't take something like 0.5 seconds or so.)

Can anybody give me some advices on this? Thank you so much!

peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 0.440704 seconds
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression_mt 
This system has 8 processors configured and 8 processors available.
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 2370 milli seconds

First of all, creating threads has overhead, if amount of work is smaller than the overhead then it's not worth creating the thread. Second, [why is reading multiple files at the same time slower than reading sequentially](https://stackoverflow.com/questions/42620323/why-is-reading-multiple-files-at-the-same-time-slower-than-reading-sequentially). We don't know what hard disk you are using. Read [this](https://stackoverflow.com/questions/28614953/is-it-a-good-idea-to-read-multiple-files-at-the-same-time). — Tony Tannous, May 17 '20 at 07:14
This isn't C code but C++!? BTW, whichever language you pick, consider putting this code up for review at codereview.stackexchange.com, there is a lot to improve. — Ulrich Eckhardt, May 17 '20 at 07:15
the timing is only for calculating/fitting the linear regression and doesn't count in the part of reading data from files. The time difference also doesn't seem to be the overhead of creating thread. (creating thread wouldn't take something like 0.5 seconds or so.) — pchen, May 17 '20 at 08:21

score 1 · Answer 1 · answered May 17 '20 at 08:40

You have an issue of false-sharing. Right here:

for(int i=offset; i<offset+chunksize; ++i) {
    sumxy[coreid] += xs[i]*ys[i];
    sumx[coreid] += xs[i];
    sumy[coreid] += ys[i];
    sumx2[coreid] += xs[i]*xs[i];
    //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}

Memory doesn't get transferred in bits or bytes but in cache lines - currently they are 64 bytes. If two cores modify data on the same cacheline a complex routine will run to fix the issue. Even if they modified completely different pieces of the cacheline. Since all your cores operate on more-or-less the same cachelines and modify them - expect a huge slowdown by using multiple threads.

To fix it, just create local variables sum**_local for the thread computation and only in the end put them in shared cache.

double sumxy_local = 0., sumx_local = 0., sumy_local = 0., sumx2_local = 0.;
for(int i=offset; i<offset+chunksize; ++i) {
    sumxy_local += xs[i]*ys[i];
    sumx_local  += xs[i];
    sumy_local  += ys[i];
    sumx2_local  += xs[i]*xs[i];
    //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
sumxy[coreid] = sumxy_local;
sumx[coreid] = sumx_local;
sumy[coreid] = sumy_local;
sumx2[coreid] = sumx2_local;

This should resolve the problem.

Disclaimer: there could be other issues regarding multi-threading that could cause slowdown but this one seems most critical.

After I apply the changes, it only took 140 million seconds now. Thanks! — pchen, May 17 '20 at 09:00
@Lyuen-ChiouChen - wait, 140 **million** seconds? Did you mean 140 milliseconds? — ALX23z, May 17 '20 at 09:08

score 0 · Answer 2 · answered May 17 '20 at 07:23

0

My guess is that the speed benefit of running things in parallel is more than offset by the high cost of the threading machinery. Running instructions flat out will be faster than system calls and library overhead for thread creation and coordination.

answered May 17 '20 at 07:23

9nut

316
2
8

I think the time difference doesn't seem to be the overhead of creating thread. (creating thread wouldn't take something like 0.5 seconds or so.) – pchen May 17 '20 at 08:20

Implemented simple linear regression in C, but much slower with p_thread

2 Answers2