I implemented simple linear regression in C twice: a single-threaded version and a multi-threaded version. The multi-threaded one runs much slower.
The single-threaded version is attached below. It is very simple: first it reads the xs and ys from two files in the same folder, then it computes the coefficient and intercept of the line, which boils down to additions and multiplications over the data in xs and ys.
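For reference, these are the standard closed-form least-squares formulas that lr() below evaluates in a single pass over the data:

\[
\text{coef} = \frac{N\sum_i x_i y_i - \sum_i x_i \sum_i y_i}{N\sum_i x_i^2 - \left(\sum_i x_i\right)^2},
\qquad
\text{intercept} = \frac{\sum_i y_i - \text{coef}\cdot\sum_i x_i}{N}
\]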
// gcc simple_regression.cpp -lstdc++ -o simple_regression
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

const long long int N = 8000000*10;

typedef struct {
    double coef;
    double intercept;
} lr_res;

// one pass over xs/ys accumulating the sums needed by the closed-form formulas
lr_res lr(double *xs, double *ys) {
    double sumxy = 0.0, sumx = 0.0, sumy = 0.0, sumx2 = 0.0;
    int i;
    for (i = 0; i < N; ++i) {
        sumxy += xs[i]*ys[i];
        sumx  += xs[i];
        sumy  += ys[i];
        sumx2 += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumx=%f\n", xs[i], sumx);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx/N, sumy/N);
    lr_res res;
    res.coef = (N*sumxy - sumx*sumy)/(N*sumx2 - sumx*sumx);
    res.intercept = (sumy - res.coef*sumx)/N;
    return res;
}

int main(int argc, char **argv) {
    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;
    char *ptr;
    //double xs[N], ys[N];
    double *xs = new double[N];
    double *ys = new double[N];
    int i;

    // read xs.data and ys.data, one value per line
    i = 0;
    x_file = fopen("xs.data", "r");
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }
    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    printf("start fitting...\n");
    clock_t t = clock();
    lr_res res = lr(xs, ys);
    t = clock() - t;
    double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds
    printf("coef: %f, intercept: %f\n", res.coef, res.intercept);
    printf("fitting took %f seconds\n", time_taken);
    return 0;
}
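The xs.data and ys.data files simply contain one decimal number per line, which is what the getline/strtod loop expects. The exact values shouldn't matter for the timing question; a throwaway generator along these lines produces input in the same format (the y = 2x + noise relation below is just an arbitrary choice for reproduction, not necessarily my real data):

// gen_data.cpp -- hypothetical generator, only to reproduce the setup
// gcc gen_data.cpp -lstdc++ -o gen_data
#include <stdio.h>
#include <stdlib.h>
const long long int N = 8000000*10;
int main() {
    FILE *x_file = fopen("xs.data", "w");
    FILE *y_file = fopen("ys.data", "w");
    if (!x_file || !y_file) { perror("fopen"); return 1; }
    srand(42);
    for (long long int i = 0; i < N; ++i) {
        double x = (double)rand() / RAND_MAX;                   // x in [0, 1]
        double noise = ((double)rand() / RAND_MAX - 0.5) * 0.1; // small noise
        double y = 2.0*x + noise;                               // arbitrary relation
        fprintf(x_file, "%f\n", x);                             // one value per line
        fprintf(y_file, "%f\n", y);
    }
    fclose(x_file);
    fclose(y_file);
    return 0;
}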
The multi-threaded version is attached below. When calculating the coefficient and intercept, it divides the summation work among nprocs = 8 threads: each thread accumulates the four sums over its own contiguous chunk of N/8 = 10,000,000 elements, and the partial sums are combined at the end. The hope was that this would be roughly 8 times faster than the single-threaded version.
// gcc simple_regression_mt.cpp -lstdc++ -lpthread -o simple_regression_mt
// https://www.statisticshowto.com/what-is-a-regression-equation/
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <pthread.h>
#include <string.h>

const long long int N = 8000000*10;
//const long long int N = 800;
const int nprocs = 8;
int chunksize = N/nprocs;

//double xs[N], ys[N];
double *xs = new double[N];
double *ys = new double[N];
// per-thread partial sums, indexed by thread id
double sumxy[nprocs], sumx[nprocs], sumy[nprocs], sumx2[nprocs];

// each thread accumulates the four sums over its own contiguous chunk
void *lr_calc(void *arg) {
    char *coreid_str = (char *) arg;
    int coreid = strtol(coreid_str, (char **)NULL, 10) - 1;
    int offset = coreid*chunksize;
    //printf("core id:%d starting up...\n", coreid);
    for (int i = offset; i < offset+chunksize; ++i) {
        sumxy[coreid] += xs[i]*ys[i];
        sumx[coreid]  += xs[i];
        sumy[coreid]  += ys[i];
        sumx2[coreid] += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx[coreid]/chunksize, sumy[coreid]/chunksize);
    //printf("core id:%d finishing...\n", coreid);
    return NULL;
}

// small int -> string helper, used to pass the thread id to lr_calc
char *itoa(int val, int base) {
    static char buf[32] = {0};
    int i = 30;
    for (; val && i; --i, val /= base)
        buf[i] = "0123456789abcdef"[val % base];
    return &buf[i+1];
}

int main(int argc, char **argv) {
    printf("This system has %d processors configured and "
           "%d processors available.\n",
           get_nprocs_conf(), get_nprocs());

    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;
    char *ptr;
    int i;

    // read x, y into the dynamic arrays xs, ys (one value per line)
    i = 0;
    x_file = fopen("xs.data", "r");
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }
    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    // multithreaded version of linear regression
    printf("start fitting...\n");
    struct timeval tp;
    gettimeofday(&tp, NULL);
    long long ms = tp.tv_sec * 1000LL + tp.tv_usec / 1000;

    // spawn one worker per expected core, passing it its id as a string
    pthread_t tid[nprocs];
    for (int i = 0; i < nprocs; ++i) {
        char *scoreid = itoa(i+1, 10);
        char *scoreid_dup = strdup(scoreid);
        pthread_create(&tid[i], NULL, lr_calc, (void *)scoreid_dup);
    }
    void *v;
    for (int i = 0; i < nprocs; ++i) {
        pthread_join(tid[i], &v);
    }

    // combine the per-thread partial sums
    double ssumxy = 0.0, ssumx = 0.0, ssumy = 0.0, ssumx2 = 0.0;
    for (int i = 0; i < nprocs; ++i) {
        ssumxy += sumxy[i];
        ssumx  += sumx[i];
        ssumy  += sumy[i];
        ssumx2 += sumx2[i];
        //printf("sumxy[i]: %f\n", sumxy[i]);
        //printf("ssumxy: %f\n", ssumxy);
    }
    //printf("avg(x): %f, avg(y): %f\n", ssumx/N, ssumy/N);
    double coef = (N*ssumxy - ssumx*ssumy)/(N*ssumx2 - ssumx*ssumx);
    double intercept = (ssumy - coef*ssumx)/N;

    struct timeval tp1;
    gettimeofday(&tp1, NULL);
    long long ms2 = tp1.tv_sec * 1000LL + tp1.tv_usec / 1000;
    int time_taken = (int)(ms2 - ms); // in milliseconds
    printf("coef: %f, intercept: %f\n", coef, intercept);
    printf("fitting took %d milli seconds\n", time_taken);
    return 0;
}
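(A side note on the code, not on the timings: the itoa/strdup/strtol round-trip is only there to hand each thread its index. The index could also be passed by value through the void* argument; here is a minimal sketch of that pattern, separate from the program I actually benchmarked above:)

// pass_index.cpp -- sketch of passing a thread index without strings
// gcc pass_index.cpp -lstdc++ -lpthread -o pass_index
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
const int nprocs = 8;
void *worker(void *arg) {
    int coreid = (int)(intptr_t)arg;   // recover the index directly from the pointer value
    printf("thread %d running\n", coreid);
    return NULL;
}
int main() {
    pthread_t tid[nprocs];
    for (int i = 0; i < nprocs; ++i)
        pthread_create(&tid[i], NULL, worker, (void *)(intptr_t)i);
    for (int i = 0; i < nprocs; ++i)
        pthread_join(tid[i], NULL);
    return 0;
}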
However, the results attached below suggest otherwise: the multi-threaded version is much slower. Note that the timing covers only the calculation/fitting of the linear regression and does not include reading the data from the files. The difference also doesn't look like thread-creation overhead (creating a handful of threads shouldn't take something like 0.5 seconds).
Can anybody give me some advice on this? Thank you so much!
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 0.440704 seconds
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression_mt
This system has 8 processors configured and 8 processors available.
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 2370 milli seconds
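In case the measurement itself matters: the single-threaded version is timed with clock(), which returns CPU time, while the multi-threaded version uses gettimeofday(), which is wall-clock time. A minimal sketch of timing both the same way with a monotonic wall clock (not what produced the numbers above) would be:

#include <stdio.h>
#include <time.h>
static double now_seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);   // monotonic wall clock
    return ts.tv_sec + ts.tv_nsec / 1e9;
}
int main() {
    double t0 = now_seconds();
    // ... the work to time, e.g. the call to lr(xs, ys) ...
    double t1 = now_seconds();
    printf("took %f seconds\n", t1 - t0);
    return 0;
}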