I'm pretty new to C, so I'm not sure where even to start digging about my problem. I'm trying to port python number-crunching algos to C, and since there's no GIL in C (woohoo), I can change whatever I want in memory from threads, for as long as I make sure there are no races.
I did my homework on mutexes, however, I cannot wrap my head around use of mutexes in case of continuously running threads accessing the same array over and over.
I'm using p_threads in order to split the workload on a big array a[N]
.
Number crunching algorithm on array a[N]
is additive, so I'm splitting it using a_diff[N_THREADS][N]
array, writing changes to be applied to a[N]
array from each thread to a_diff[N_THREADS][N]
and then merging them all together after each step.
I need to run the crunching on different versions of array a[N]
, so I pass them via global pointer p
(in the MWE, there's only one a[N]
)
I'm synchronizing threads using another global array SYNC_THREADS[N_THREADS]
and make sure threads quit when I need them to by setting END_THREADS
global (I know, I'm using too many globals - I don't care, code is ~200 lines). My question is in regard to this synchronization technique - is it safe to do so and what is cleaner/better/faster way to achieve that?
MWEe:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define N_THREADS 3
#define N 10000000
#define STEPS 3
double a[N]; // main array
double a_diff[N_THREADS][N]; // diffs array
double params[N]; // parameter used for number-crunching
double (*p)[N]; // pointer to array[N]
// structure for bounds for crunching the array
struct bounds {
int lo;
int hi;
int thread_num;
};
struct bounds B[N_THREADS];
int SYNC_THREADS[N_THREADS]; // for syncing threads
int END_THREADS = 0; // signal to terminate threads
static void *crunching(void *arg) {
// multiple threads run number-crunching operations according to assigned low/high bounds
struct bounds *data = (struct bounds *)arg;
int lo = (*data).lo;
int hi = (*data).hi;
int thread_num = (*data).thread_num;
printf("worker %d started for bounds [%d %d] \n", thread_num, lo, hi);
int i;
while (END_THREADS != 1) { // END_THREADS tells threads to terminate
if (SYNC_THREADS[thread_num] == 1) { // SYNC_THREADS allows threads to start number-crunching
printf("worker %d working... \n", thread_num );
for (i = lo; i <= hi; ++i) {
a_diff[thread_num][i] += (*p)[i] * params[i]; // pretend this is an expensive operation...
}
SYNC_THREADS[thread_num] = 0; // thread disables itself until SYNC_THREADS is back to 1
printf("worker %d stopped... \n", thread_num );
}
}
return 0;
}
int i, j, th,s;
double joiner;
int main() {
// pre-fill arrays
for (i = 0; i < N; ++i) {
a[i] = i + 0.5;
params[i] = 0.0;
}
// split workload between workers
int worker_length = N / N_THREADS;
for (i = 0; i < N_THREADS; ++i) {
B[i].thread_num = i;
B[i].lo = i * worker_length;
if (i == N_THREADS - 1) {
B[i].hi = N;
} else {
B[i].hi = i * worker_length + worker_length - 1;
}
}
// pointer to parameters to be passed to worker
struct bounds **data = malloc(N_THREADS * sizeof(struct bounds*));
for (i = 0; i < N_THREADS; i++) {
data[i] = malloc(sizeof(struct bounds));
data[i]->lo = B[i].lo;
data[i]->hi = B[i].hi;
data[i]->thread_num = B[i].thread_num;
}
// create thread objects
pthread_t threads[N_THREADS];
// disallow threads to crunch numbers
for (th = 0; th < N_THREADS; ++th) {
SYNC_THREADS[th] = 0;
}
// launch workers
for(th = 0; th < N_THREADS; th++) {
pthread_create(&threads[th], NULL, crunching, data[th]);
}
// big loop of iterations
for (s = 0; s < STEPS; ++s) {
for (i = 0; i < N; ++i) {
params[i] += 1.0; // adjust parameters
// zero diff array
for (i = 0; i < N; ++i) {
for (th = 0; th < N_THREADS; ++th) {
a_diff[th][i] = 0.0;
}
}
p = &a; // pointer to array a
// allow threads to process numbers and wait for threads to complete
for (th = 0; th < N_THREADS; ++th) { SYNC_THREADS[th] = 1; }
// ...here threads started by pthread_create do calculations...
for (th = 0; th < N_THREADS; th++) { while (SYNC_THREADS[th] != 0) {} }
// join results from threads (number-crunching is additive)
for (i = 0; i < N; ++i) {
joiner = 0.0;
for (th = 0; th < N_THREADS; ++th) {
joiner += a_diff[th][i];
}
a[i] += joiner;
}
}
}
// join workers
END_THREADS = 1;
for(th = 0; th < N_THREADS; th++) {
pthread_join(threads[th], NULL);
}
return 0;
}
I see that workers don't overlap time-wise:
worker 0 started for bounds [0 3333332]
worker 1 started for bounds [3333333 6666665]
worker 2 started for bounds [6666666 10000000]
worker 0 working...
worker 1 working...
worker 2 working...
worker 2 stopped...
worker 0 stopped...
worker 1 stopped...
worker 2 working...
worker 0 working...
worker 1 working...
worker 1 stopped...
worker 0 stopped...
worker 2 stopped...
worker 2 working...
worker 0 working...
worker 1 working...
worker 1 stopped...
worker 2 stopped...
worker 0 stopped...
Process returned 0 (0x0) execution time : 1.505 s
and I make sure worker's don't get into each other workspaces by separating them via a_diff[thead_num][N]
sub-arrays, however, I'm not sure that's always the case and that I'm not introducing hidden races somewhere...