I'm writing a program to sum N numbers in an array. To do this, the program may split its execution in a number of threads, each of them summing N/numThreads positions of the array.
However, the fastest execution time I've got so far was using 1 thread. Using 4 threads may take as much as twice the time! I've thought a lot about this problem (including the possibility of false sharing), but I just can't think of a solution. I really can't see where the false sharing is occurring, if at all.
I'm running Linux Mint and compiling using GCC by typing gcc spinlock.c -o spinlock -lpthread -lm My code is below. Please feel free to compile it and play around with it. You may change the value of N (number of elements in array) by changing the Define in the program and you may pass any number of threads as arguments for the program:
#define _GNU_SOURCE
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <sys/types.h>
#include <inttypes.h>
#include <sched.h>
#include <math.h>
#include <time.h>
#define N 10000000
long sumTotal = 0; /* Global sum variable */
int lock = 0; /* Lock for sum variable access */
struct threadInfo { /* Argument passed to the function the thread will execute */
pthread_t threadId; /* Returned ID by pthread_create() */
int threadNum; /* Thread number (from 1 to threadNum) */
unsigned long sectionSize; /* Section size to operate */
int8_t* section; /* Pointer to start of the array to be summed */
};
void acquire(int* _lock) {
while (__sync_lock_test_and_set(_lock, 1));
}
void release(int* _lock) {
*_lock = 0;
}
void sumNum(void* arg){
struct threadInfo *tinfo = arg;
long i, sum = 0;
for (i=0; i < tinfo->sectionSize; i++) {
sum += tinfo->section[i];
}
acquire(&lock);
sumTotal += sum;
release(&lock);
}
int main(int argc, char** argv)
{
int8_t* numbers;
unsigned long sectionSize, currNum;
unsigned int currThr, numThreads;
int err, num_cores, i, core_interval, t1, t2;
double arithmetics;
pthread_attr_t attr;
struct threadInfo *tinfo;
/* Invalid number of arguments: */
if (argc != 2){
fprintf(stderr, "Usage: %s [threads]\n", argv[0]);
exit(-1);
}
/* Converts argv to int: (K) */
numThreads = (int) strtol(argv[1], (char**) NULL, 10);
sectionSize = N/numThreads;
/* Randoms the seed: */
srand(time(NULL));
/* Fills array with numbers: */
numbers = (int8_t*) malloc(N);
for (currNum=0; currNum<N; currNum++){
numbers[currNum] = (int8_t) (((rand())%199)-99);
}
/* Initializes struct containing default attributes of threads: */
err = pthread_attr_init(&attr);
if (err != 0){
fprintf(stderr, "Failed.\n");
exit(-2);
}
/* Allocates memory for pthread_create arguments: */
tinfo = calloc(numThreads, sizeof(struct threadInfo));
if (tinfo == NULL){
fprintf(stderr, "Failed.\n");
exit(-3);
}
/* Thread affinity operations: */
num_cores = sysconf(_SC_NPROCESSORS_ONLN);
cpu_set_t cpuset[num_cores];
for (i=0; i<num_cores; i++){
CPU_ZERO(&(cpuset[i]));
CPU_SET(i+1, &(cpuset[i]));
}
core_interval = N/num_cores;
t1 = clock();
/* Initialize threads: */
for (currThr=0; currThr<numThreads; currThr++){
tinfo[currThr].threadNum = currThr + 1;
tinfo[currThr].section = &(*(numbers+currThr*sectionSize));
/* If this is the last thread, takes the remainder of the array: */
if (currThr+1 == numThreads) {
tinfo[currThr].sectionSize = N - (currThr*sectionSize);
/* Other threads: */
} else {
tinfo[currThr].sectionSize = sectionSize;
}
/* Calculates to which core a thread should go: */
arithmetics = (double) (currThr+1)*num_cores/numThreads;
i = ceil(arithmetics);
tinfo[currThr].threadId = pthread_self();
/* This line, if uncommented, sets a group of threads to a specific cpu (core): */
//pthread_setaffinity_np(tinfo[currThr].threadId, sizeof(cpu_set_t), &cpuset[i-1]);
/* Creates the thread: */
err = pthread_create(&tinfo[currThr].threadId,
&attr,
(void*) &sumNum,
&tinfo[currThr]);
if (err != 0){
fprintf(stderr, "Failed to create thread.\n");
exit(-4);
}
}
void* ret = NULL;
/* Joins threads: */
for (currThr=0; currThr<numThreads; currThr++) {
pthread_join(tinfo[currThr].threadId, &ret);
}
t2 = clock();
printf("Total sum: %ld. Total time: %f\n", sumTotal, (t2-t1*1.0)/CLOCKS_PER_SEC);
return 0;
}
Thank you for your time, any help is appreciated!