I am using pthreads with gcc. The simple code example takes the number of threads "N" as a user-supplied input. It splits up a long array into N roughly equally sized subblocks. Each subblock is written into by individual threads.
The dummy processing for this example really involves sleeping for a fixed amount of time for each array index and then writing a number into that array location.
Here's the code:
/******************************************************************************
* FILE: threaded_subblocks_processing
* DESCRIPTION:
* We have a bunch of parallel processing to do and store the results in a
* large array. Let's try to use threads to speed it up.
******************************************************************************/
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#define BIG_ARR_LEN 10000
typedef struct thread_data{
int start_idx;
int end_idx;
int id;
} thread_data_t;
int big_result_array[BIG_ARR_LEN] = {0};
void* process_sub_block(void *td)
{
struct thread_data *current_thread_data = (struct thread_data*)td;
printf("[%d] Hello World! It's me, thread #%d!\n", current_thread_data->id, current_thread_data->id);
printf("[%d] I'm supposed to work on indexes %d through %d.\n", current_thread_data->id,
current_thread_data->start_idx,
current_thread_data->end_idx-1);
for(int i=current_thread_data->start_idx; i<current_thread_data->end_idx; i++)
{
int retval = usleep(1000.0*1000.0*10.0/BIG_ARR_LEN);
if(retval)
{
printf("sleep failed");
}
big_result_array[i] = i;
}
printf("[%d] Thread #%d done, over and out!\n", current_thread_data->id, current_thread_data->id);
pthread_exit(NULL);
}
int main(int argc, char *argv[])
{
if (argc!=2)
{
printf("usage: ./a.out number_of_threads\n");
return(1);
}
int NUM_THREADS = atoi(argv[1]);
if (NUM_THREADS<1)
{
printf("usage: ./a.out number_of_threads (where number_of_threads is at least 1)\n");
return(1);
}
pthread_t *threads = malloc(sizeof(pthread_t)*NUM_THREADS);
thread_data_t *thread_data_array = malloc(sizeof(thread_data_t)*NUM_THREADS);
int block_size = BIG_ARR_LEN/NUM_THREADS;
for(int i=0; i<NUM_THREADS-1; i++)
{
thread_data_array[i].start_idx = i*block_size;
thread_data_array[i].end_idx = (i+1)*block_size;
thread_data_array[i].id = i;
}
thread_data_array[NUM_THREADS-1].start_idx = (NUM_THREADS-1)*block_size;
thread_data_array[NUM_THREADS-1].end_idx = BIG_ARR_LEN;
thread_data_array[NUM_THREADS-1].id = NUM_THREADS;
int ret_code;
long t;
for(t=0;t<NUM_THREADS;t++){
printf("[main] Creating thread %ld\n", t);
ret_code = pthread_create(&threads[t], NULL, process_sub_block, (void *)&thread_data_array[t]);
if (ret_code){
printf("[main] ERROR; return code from pthread_create() is %d\n", ret_code);
exit(-1);
}
}
printf("[main] Joining threads to wait for them.\n");
void* status;
for(int i=0; i<NUM_THREADS; i++)
{
pthread_join(threads[i], &status);
}
pthread_exit(NULL);
}
and I compile it with
gcc -pthread threaded_subblock_processing.c
and then I call it from command line like so:
$ time ./a.out 4
I see a speed up when I increase the number of threads. With 1 thread the process takes just a little over 10 seconds. This makes sense because I sleep for 1000 usec per array element, and there are 10,000 array elements. Next when I go to 2 threads, it goes down to a little over 5 seconds, and so on.
What I don't understand is that I get a speed-up even after my number of threads exceeds the number of cores on my computer! I have 4 cores, so I was expecting no speed-up for >4 threads. But, surprisingly, when I run
$ time ./a.out 100
I get a 100x speedup and the processing completes in ~0.1 seconds! How is this possible?