1

Have some problems with assigning parallel algorithm to prefix sum issue. I am using openMP for parallel implementation. I have the code in c as below.

Result showing:

seqsum[6] = 28 != parallelsum[6] = 34

Please advise. Thanks.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "omp.h"
#include <string.h>
#define N 10 //33554432 // 2 ^ 25
#define NUM_THREADS 4 

void computeparallelprefix(int *iplist, int *_pprefixsum, unsigned long size)
{
  int nthr, *z, *x = _pprefixsum;
  int i, j, tid, work, lo, hi;
#pragma omp parallel shared(nthr,x,z) private(i,j,tid,work,lo,hi)
  {
    int prev_sum;
    memcpy((void *)x, (void *)iplist, sizeof(int)*size);

    // Assume nthr = 2^k
#pragma omp single
    {
      nthr = omp_get_num_threads();
      z = malloc(sizeof(int)*nthr);
    }   
    tid = omp_get_thread_num();
    work = size /nthr + (i = tid < size%nthr ? 1 : 0);
    lo = (size/nthr)*tid + (i==1 ? tid : size%nthr);
    hi = lo + work;
    if (hi > size)
      hi = size;

    // local prefix sum over x
    for(i=lo+1; i<hi; i++)
      x[i] += x[i-1];    

    // local prefix sum for tid
    z[tid] = x[hi-1];
#pragma omp barrier


    // global prefix sum over z
    for(j=1; j<nthr; j=2*j) {
      if (tid >= j)
        z[tid] = z[tid] + z[tid-j];
#pragma omp barrier
    } 

    // Update local prefix sum x
    prev_sum = z[tid] - x[hi-1];
    for(i=lo;i<hi;i++)
      x[i] += prev_sum;
  }
  free(z);
}

void initlist(int *iplist, unsigned long size)
{
  int i;
  for ( i = 0; i < size; i++)
    iplist[i] = i+1;
  //     iplist[i] = rand() % 13;
}

void printlist(int *list, unsigned long size)
{
  int i;
  for(i = 0; i < size; i++) {
    printf("%d ", list[i]);
  }
  printf("\n");
}

void computeseqprefixsum(int *iplist, int *seqprefixsum, unsigned long size)
{
  int i;
  seqprefixsum[0] = iplist[0];
  for(i = 1; i < size; i++) {
    seqprefixsum[i] = seqprefixsum[i-1] + iplist[i];
  }
}

void checkresults(int *seqsum, int *parallelsum, unsigned long size)
{
  int i;
  for(i = 0; i < size; i++)
  {
    if(seqsum[i] != parallelsum[i]) {
      printf("seqsum[%d] = %d != parallelsum[%d] = %d\n", i, seqsum[i], i,
          parallelsum[i]);
      exit(1);
    }
  }
}

int main(int argc, char *argv[])
{
  // seed the rand generator
  srand(time(NULL));
  double seqstart, seqend, parstart, parend, seqtime, partime;

  // initialize list

  int *iplist, *seqprefixsum, *pprefixsum ;

  iplist = (int*) malloc(sizeof(int) * N);
  seqprefixsum = (int*) malloc(sizeof(int) * N);
  pprefixsum = (int*) malloc(sizeof(int) * N);

  if(iplist == NULL || seqprefixsum == NULL || pprefixsum == NULL) {
    printf("memory cannot be allocated\n");
    exit(1);
  }

  initlist(iplist, N);

  seqstart = omp_get_wtime();

  computeseqprefixsum(iplist, seqprefixsum, N);

  seqend = omp_get_wtime();

  seqtime = seqend - seqstart;

  omp_set_num_threads(NUM_THREADS);

  parstart = omp_get_wtime();

  computeparallelprefix(iplist, pprefixsum, N);

  parend= omp_get_wtime();

  partime = parend - parstart;

  checkresults(seqprefixsum, pprefixsum, N);

  printf("Seq Time : %f, Par Time : %f, Speedup : %f\n", seqtime, partime,
      seqtime/partime);

  free(iplist); free(seqprefixsum); free(pprefixsum);

  return 0;
}
Mike Laren
  • 8,028
  • 17
  • 51
  • 70
Orangeblue
  • 229
  • 1
  • 5
  • 15
  • due to the data dependency in the calculation , I think it the problem is hard to parallelize from the start – dvhh Mar 23 '15 at 00:57
  • How can I improve it? Thanks – Orangeblue Mar 23 '15 at 02:35
  • you really cannot parallelize your computeseqprefixsum because each value is depending on the previous one – dvhh Mar 23 '15 at 02:45
  • @dvhh, it's not too difficult to parallelize in two passes. The first pass you do partial sums for each thread and in the second pass you correct with an offset for each thread. The problem is not getting a parallel algorithm it's that the operation is memory bandwidth bound (like many operations) and so it does not scale with with the number of physical cores. – Z boson Mar 23 '15 at 08:10
  • @dvhh, in fact that's exactly what the OP is doing: two passes, first is partial, second is a correction. Here's a working example https://stackoverflow.com/questions/18719257/parallel-cumulative-prefix-sums-in-openmp-communicating-values-between-thread. I'm not sure why the OPs code is not working but it's the right idea. – Z boson Mar 23 '15 at 08:30

1 Answers1

0

You have the right idea for the prefix sum with your code.

I'm not sure exactly why you don't get the correct result but I cleaned up your code and my version gets the correct result. See the following question for more details parallel-cumulative-prefix-sums-in-openmp-communicating-values-between-thread

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "omp.h"
#define N 11 //33554432 // 2 ^ 25

void computeparallelprefix(int *iplist, int *_pprefixsum, unsigned long size)
{
  int nthr, *z, *x = _pprefixsum;
  #pragma omp parallel
  {
    int i;
    #pragma omp single
    {
      nthr = omp_get_num_threads();
      z = malloc(sizeof(int)*nthr+1);
      z[0] = 0;
    }
    int tid = omp_get_thread_num();
    int sum = 0;
    #pragma omp for schedule(static) 
    for(i=0; i<size; i++) {
      sum += iplist[i];
      x[i] = sum;
    }
    z[tid+1] = sum;
    #pragma omp barrier

    int offset = 0;
    for(i=0; i<(tid+1); i++) {
        offset += z[i];
    }

    #pragma omp for schedule(static)
    for(i=0; i<size; i++) {
      x[i] += offset;
    }
  }
  free(z);
}

int main(void ) {

  int *iplist, *pprefixsum ;

  iplist = (int*) malloc(sizeof(int) * N);
  pprefixsum = (int*) malloc(sizeof(int) * N);

  for(int i=0; i<N; i++) iplist[i] = i+1;
  for(int i=0; i<N; i++) printf("%d ", iplist[i]); printf("\n");

  computeparallelprefix(iplist, pprefixsum, N);
  for(int i=0; i<N; i++) printf("%d ", pprefixsum[i]); printf("\n");
  for(int i=0; i<N; i++) printf("%d ", (i+1)*(i+2)/2); printf("\n");
  return 0;
}
Z boson
  • 32,619
  • 11
  • 123
  • 226
  • Thanks for that! However, it still not works at my end. All progma commands are not functional at all: error: invalid preprocessing directive #progma – Orangeblue Mar 24 '15 at 17:58
  • @Orangeblue, what are you talking about? There is no `#progma` in my code. I just compiled with `-Wall` and I get no warnings. I also grepped for `progma` which returned nothing. This code is fine. I compiled with `gcc -std=gnu99 -fopenmp -O3 -Wall psum.c` – Z boson Mar 25 '15 at 07:26
  • It seems one thread was allowed to malloc "z", but every thread to free the memory "z". – Orangeblue Mar 26 '15 at 18:56
  • @Zboson: Orangeblue has [suggested an edit](http://stackoverflow.com/review/suggested-edits/7482112) to change all the `#pragma` to `#progma`. – Nisse Engström Mar 26 '15 at 19:24
  • @NisseEngström, his edit should be rejected. What is `#progma`? – Z boson Mar 27 '15 at 07:45
  • @NisseEngström, his own quesiton uses `#pragma`. – Z boson Mar 27 '15 at 07:47