
I am learning parallel processing using Pthreads. I have a quad core processor. Unfortunately, the parallelized portion of the following code is running roughly 5X slower than the non-parallelized code. What am I doing wrong here? Thanks in advance for the help.

#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#define NTHREADS 4
#define SIZE NTHREADS*10000000

struct params {
  int * arr;
  int sum;
};

/* The worker function for the pthreads */
void * myFun (void * x){
  int i;
  struct params * b = (struct params *) x;
  for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
    b->sum += b->arr[i];
  }
  return NULL;
}

/* unparallelized summing function*/
int arrSum(int * arr, int size){
  int sum = 0;
  for (int i = 0; i != size; ++i){
    sum += arr[i];
  }
  return sum;
}

int main(int argc, char * argv[]){
  clock_t begin, end;
  double runTime;
  int rc, i;
  int sum1 = 0, sum2 = 0;
  pthread_t threads[NTHREADS];

  /* create array to sum over */
  int * myArr = NULL;
  myArr = (int *) calloc(SIZE, sizeof(int));
  if (myArr == NULL){
    printf("problem allocating memory\n");
    return 1; 
  }
  for (int i = 0; i < SIZE; ++i){
    myArr[i] = 1;
  }

  /* create array of params structs to feed to threads */
  struct params p;
  p.sum = 0;
  struct params inputs[NTHREADS];
  for(i = 0; i != NTHREADS; ++i){
    p.arr = myArr + i*(int)(SIZE/NTHREADS);
    inputs[i] = p;
  }

  /* spawn the threads */
  begin = clock();
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
  }

  /* wait for threads to finish */
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_join(threads[i], NULL);
  }
  end = clock();
  runTime = (double)(end - begin)/CLOCKS_PER_SEC;
  printf("Parallelized code run time: %f\n", runTime);

  /* run the unparallelized code */
  begin = clock();
  sum2 = arrSum(myArr, SIZE);
  end = clock();
  runTime = (double)(end - begin)/CLOCKS_PER_SEC;
  printf("Unparallelized code run time: %f\n", runTime);

  /* consolidate and print results from threads */
  for(i = 0; i != NTHREADS; ++i){
    sum1 += inputs[i].sum;
  }
  printf("sum1, sum2: %d, %d \n", sum1, sum2);

  free(myArr);

  /* be disappointed when my parallelized code showed no speedup */
  return 1;
}
Z boson
Thirdeye
  • Is there any reason to add the C++ tag? – too honest for this site Jul 05 '15 at 04:14
  • @Olaf the code is valid in both C++ and C. Perhaps the OP doesn't care whether the answer is C or C++. – Hi-Angel Jul 05 '15 at 04:17
  • @Hi-Angel: There are quite some differences between both languages. For instance, in C you should not cast `void *`, while in C++ you have to. In general, you should not write code like this in C++, so I assume it is C. – too honest for this site Jul 05 '15 at 04:18
  • `b->sum += b->arr[i];` is not the same as the other function because it writes to a memory location on each loop. Try using a local variable to compute the sum and then doing `b->sum = sum` at the end of the function. – JS1 Jul 05 '15 at 04:21
  • To be *certain* you're comparing apples to apples, use the same function to compute the sums in each case. Nothing prevents you from running your `myFun()` function directly, as many times in a row as you need. – John Bollinger Jul 05 '15 at 04:46
  • Assuming that your machine actually has at least 4 cores available, I suspect you're seeing worse performance due to cache contention among the threads (and if you *don't* have 4 cores, then the problem is you have more threads than cores). Once each thread's working set blows out its core's L2 cache, it's going to start hitting the shared L3 cache, and the contention for that is going to hurt overall performance. – Adam Rosenfield Jul 05 '15 at 06:23
  • [The problem is you're using `clock`](http://stackoverflow.com/questions/10874214/measure-execution-time-in-c-openmp-code/10874371#10874371). – Z boson Jul 06 '15 at 11:41

2 Answers


You're missing one important aspect of parallel programming.

The worker threads need to be created once per process, not for every task.

Creating and destroying threads takes time.

The solution is to use a thread pool and send tasks to the pool.

My suggestion is to use OpenMP, which simplifies this task considerably and works with many compilers.

Example:

int sum = 0;
#pragma omp parallel for shared(sum)
for(int i=0; i<SIZE; ++i)
{
  #pragma omp atomic
  sum += myArr[i];
}

To make this work faster, do some loop unrolling - e.g. accumulate the sum of 8 numbers in a single loop iteration, as sketched below.

egur
  • thanks for the response. However I am unsure what you mean by "threads need to be created once per process, not once for every task". It's my understanding that in order to have a chance of having all four of my cores simultaneously working on summing parts of the array, I would need to have 4 threads running. Could you please elaborate a little more? Thanks. – Thirdeye Jul 05 '15 at 06:44
  • Create the 4 threads once at the start of the program. Creating threads takes time. The created threads wait for a task (e.g. via mutex). After a thread finishes its task, it waits for another one. This is called a thread pool. What your code does is measure how much time it takes to create AND destroy threads, which is a constant time regardless of the load. If you need to run more tasks, you should reuse the existing threads. – egur Jul 05 '15 at 06:52
  • I understand now. Thanks. – Thirdeye Jul 05 '15 at 15:10
  • This has nothing to do with a thread pool. OpenMP has to create the threads the first time it's called as well. A thread pool is useful if you make another parallel call but the OP does not do this. – Z boson Jul 06 '15 at 11:23
  • That's incorrect. A thread pool is useful when you want to recycle threads. e.g. when running a second set of tasks after the first set has ended. – egur Jul 06 '15 at 11:27
  • @egur, exactly! That's what I said (or meant). The OP does not make a second set of tasks so a thread pool is useless. – Z boson Jul 06 '15 at 11:39
  • The OP wants to measure multithreaded performance. To do so correctly, the threads need to be created beforehand. – egur Jul 06 '15 at 12:21
  • @egur, if the OP used OpenMP the threads would have to be created as well. If the OP timed his parallel code with OpenMP it would have this same overhead. Unless he called OpenMP, ignored the results, and then timed it again after the pool had been created. But in any case you have missed the major problem with the OP's code. He used `clock()`. That's the main problem, not the overhead from thread creation, which is a minor issue. – Z boson Jul 06 '15 at 12:25
  • Your example for doing a reduction is also poor. It's inefficient to use an atomic operation every iteration. This is not even a good way to manually do a reduction with OpenMP. You should have used `#pragma omp parallel for reduction(+:sum)`, but if you want to do it manually then you should create a private version of `sum` for each thread and use one atomic per thread, not one per iteration. – Z boson Jul 08 '15 at 07:26

The main problem is that you're using clock(), which does not return the wall time but the cumulative CPU time of all threads. This is the most common mistake in questions tagged OpenMP on SO (if the frequency listing on SO were useful, it would show this).

The simplest way to get the wall time is to use a function from OpenMP: omp_get_wtime(). This works on Linux and Windows with GCC, ICC, and MSVC (and I assume Clang, which now supports OpenMP 3.1).

When I use this with your code, I get the following on my four-core/eight-hyper-thread i7 IVB system:

Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000

Some other comments. Your scheduling is error-prone. You set the array for each thread to

p.arr = myArr + i*(int)(SIZE/NTHREADS);

And then have each thread run over (SIZE/NTHREADS) elements. This can give wrong results due to rounding for some values of SIZE and NTHREADS.

You should have each thread run over

int start  = ithread*SIZE/NTHREADS;
int finish = (ithread+1)*SIZE/NTHREADS;

And then have each thread point to the beginning of the array and do

int sum = 0;
for (i = start; i < finish; ++i){
    sum += b->arr[i];
}

This is essentially what OpenMP's schedule(static) does. In fact, you can get the same effect as the pthreads code using OpenMP like this:

int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i){
    sum += arr[i];
}

Here is the code I used:

//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>

#define NTHREADS 4
#define SIZE NTHREADS*100000000

struct params {
  int * arr;
  int sum;
};

/* The worker function for the pthreads */
void * myFun (void * x){
  int i;
  struct params * b = (struct params *) x;
  int sum = 0;
  for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
    sum += b->arr[i];
  }
  b->sum = sum;
  return NULL;
}

/* unparallelized summing function*/
int arrSum(int * arr, int size){
  int sum = 0;
  for (int i = 0; i < size; ++i){
    sum += arr[i];
  }
  return sum;
}

int main(int argc, char * argv[]) {
  double runTime;
  int rc, i;
  int sum1 = 0, sum2 = 0;
  pthread_t threads[NTHREADS];

  /* create array to sum over */
  int * myArr = NULL;
  myArr = (int *) calloc(SIZE, sizeof(int));
  if (myArr == NULL){
    printf("problem allocating memory\n");
    return 1; 
  }
  for (int i = 0; i < SIZE; ++i){
    myArr[i] = 1;
  }

  /* create array of params structs to feed to threads */
  struct params p;
  p.sum = 0;
  struct params inputs[NTHREADS];
  for(i = 0; i < NTHREADS; ++i){
    p.arr = myArr + i*(int)(SIZE/NTHREADS);
    inputs[i] = p;
  }

  /* spawn the threads */
  runTime = -omp_get_wtime();  
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
  }

  /* wait for threads to finish */
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_join(threads[i], NULL);
  }

  runTime += omp_get_wtime();  
  printf("Parallelized code run time: %f\n", runTime);

  /* run the unparallelized code */
  runTime = -omp_get_wtime();
  sum2 = arrSum(myArr, SIZE);
  runTime += omp_get_wtime();
  printf("Unparallelized code run time: %f\n", runTime);

  /* consolidate and print results from threads */
  for(i = 0; i != NTHREADS; ++i){
    sum1 += inputs[i].sum;
  }
  printf("sum1, sum2: %d, %d \n", sum1, sum2);

  free(myArr);

  /* be disappointed when my parallelized code showed no speedup */
  return 1;
}
Z boson