
I'm following this PI calculation example in C to compare the execution times of the serial and parallel versions. I used gettimeofday() to measure the execution time, but the two times are about the same. Is anything wrong with my code, or with the method of measuring the time?

My code is as follows:

serial version

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>   /* gettimeofday() is declared here */
#include <math.h>

int main()
{

struct timeval tvalBefore, tvalAfter;
gettimeofday(&tvalBefore, NULL);

#define sqr(x) ((x)*(x))
long random(void);
double x_coord, y_coord, pi, r;
int score, n;
unsigned int cconst;
int darts = 5000000;

if (sizeof(cconst) != 4) {
    printf("Wrong data size for cconst variable!\nQuitting.\n");
    exit(1);
}

cconst = 2 << (31 - 1);
score = 0;

for (n = 1; n <= darts; n++) {
    r = (double)random() / cconst;
    x_coord = (2.0 * r) - 1.0;
    r = (double)random() / cconst;
    y_coord = (2.0 * r) - 1.0;

    if ((sqr(x_coord) + sqr(y_coord)) <= 1.0)
        score++;
}

pi = 4.0 * (double)score / (double)darts;

gettimeofday(&tvalAfter, NULL);
long tm = (tvalAfter.tv_sec - tvalBefore.tv_sec) * 1000000L + tvalAfter.tv_usec - tvalBefore.tv_usec;

printf("PI = %lf\nSerial execution time: %ld microseconds\n", pi, tm);
return 0;

}

parallel version

/**********************************************************************
 * FILE: mpi_pi_reduce.c
 * OTHER FILES: dboard.c
 * DESCRIPTION:  
 *   MPI pi Calculation Example - C Version 
 *   Collective Communication example:  
 *   This program calculates pi using a "dartboard" algorithm.  See
 *   Fox et al.(1988) Solving Problems on Concurrent Processors, vol.1
 *   page 207.  All processes contribute to the calculation, with the
 *   master averaging the values for pi. This version uses MPI_Reduce to
 *   collect results.
 * AUTHOR: Blaise Barney. Adapted from Ros Leibensperger, Cornell Theory
 *   Center. Converted to MPI: George L. Gusciora, MHPCC (1/95) 
 * LAST REVISED: 06/13/13 Blaise Barney
**********************************************************************/
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>   /* gettimeofday() is declared here */

void srandom (unsigned seed);
double dboard (int darts);
#define DARTS 50000     /* number of throws at dartboard */
#define ROUNDS 100      /* number of times "darts" is iterated */
#define MASTER 0        /* task ID of master task */

int main (int argc, char *argv[])
{
struct timeval tvalBefore, tvalAfter;
gettimeofday(&tvalBefore, NULL);

double  homepi,         /* value of pi calculated by current task */
        pisum,          /* sum of tasks' pi values */
        pi,             /* average of pi after "darts" is thrown */
        avepi;          /* average pi value for all iterations */
int taskid,         /* task ID - also used as seed number */
    numtasks,       /* number of tasks */
    rc,             /* return code */
    i;
MPI_Status status;

/* Obtain number of tasks and task ID */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
printf ("MPI task %d has started...\n", taskid);

/* Set seed for random number generator equal to task ID */
srandom (taskid);

avepi = 0;
for (i = 0; i < ROUNDS; i++) {
   /* All tasks calculate pi using dartboard algorithm */
   homepi = dboard(DARTS);

   /* Use MPI_Reduce to sum values of homepi across all tasks 
    * Master will store the accumulated value in pisum 
    * - homepi is the send buffer
    * - pisum is the receive buffer (used by the receiving task only)
    * - the size of the message is sizeof(double)
    * - MASTER is the task that will receive the result of the reduction
    *   operation
    * - MPI_SUM is a pre-defined reduction function (double-precision
    *   floating-point vector addition).  Must be declared extern.
    * - MPI_COMM_WORLD is the group of tasks that will participate.
    */

   rc = MPI_Reduce(&homepi, &pisum, 1, MPI_DOUBLE, MPI_SUM,
                   MASTER, MPI_COMM_WORLD);

   /* Master computes average for this iteration and all iterations */
   if (taskid == MASTER) {
      pi = pisum/numtasks;
      avepi = ((avepi * i) + pi)/(i + 1); 
      //printf("   After %8d throws, average value of pi = %10.8f\n", (DARTS * (i + 1)),avepi);
   }    
} 
if (taskid == MASTER) {
   gettimeofday(&tvalAfter, NULL);
   long tm = (tvalAfter.tv_sec - tvalBefore.tv_sec) * 1000000L + tvalAfter.tv_usec - tvalBefore.tv_usec;
   printf("\nReal value of PI: 3.1415926535897 \n"); 
   printf("Parallel execution time: %ld microseconds\n", tm);
}
MPI_Finalize();

return 0;
}



/**************************************************************************
* subroutine dboard
* DESCRIPTION:
*   Used in pi calculation example codes. 
*   See mpi_pi_send.c and mpi_pi_reduce.c  
*   Throw darts at board.  Done by generating random numbers 
*   between 0 and 1 and converting them to values for x and y 
*   coordinates and then testing to see if they "land" in 
*   the circle."  If so, score is incremented.  After throwing the 
*   specified number of darts, pi is calculated.  The computed value 
*   of pi is returned as the value of this function, dboard. 
*
*   Explanation of constants and variables used in this function:
*   darts       = number of throws at dartboard
*   score       = number of darts that hit circle
*   n           = index variable
*   r           = random number scaled between 0 and 1
*   x_coord     = x coordinate, between -1 and 1
*   x_sqr       = square of x coordinate
*   y_coord     = y coordinate, between -1 and 1
*   y_sqr       = square of y coordinate
*   pi          = computed value of pi
****************************************************************************/

double dboard(int darts)
{
#define sqr(x)  ((x)*(x))
long random(void);
double x_coord, y_coord, pi, r; 
int score, n;
unsigned int cconst;  /* must be 4-bytes in size */
/*************************************************************************
 * The cconst variable must be 4 bytes. We check this and bail if it is
 * not the right size
 ************************************************************************/
if (sizeof(cconst) != 4) {
   printf("Wrong data size for cconst variable in dboard routine!\n");
   printf("See comments in source file. Quitting.\n");
   exit(1);
   }
   /* cconst = 2^31, used to scale random() output to a value between 0 and 1 */
   cconst = 2 << (31 - 1);
   score = 0;

   /* "throw darts at board" */
   for (n = 1; n <= darts; n++)  {
      /* generate random numbers for x and y coordinates */
      r = (double)random()/cconst;
      x_coord = (2.0 * r) - 1.0;
      r = (double)random()/cconst;
      y_coord = (2.0 * r) - 1.0;

      /* if dart lands in circle, increment score */
      if ((sqr(x_coord) + sqr(y_coord)) <= 1.0)
           score++;
      }

/* calculate pi */
pi = 4.0 * (double)score/(double)darts;
return(pi);
} 

I compiled and ran the code on the cluster with:

mpicc serial.c -o serial.o
mpicc parallel.c -o parallel.o

mpirun -n 1 serial.o
mpirun -np 4 -pernode parallel.o

The results are:

# serial
PI = 3.142431
Serial execution time: 262699 microseconds

# parallel
MPI task 1 has started...
MPI task 0 has started...
MPI task 3 has started...
MPI task 2 has started...

Real value of PI: 3.1415926535897
Parallel execution time: 294984 microseconds
  • the `random()` function is prototyped in `stdlib.h`, so it is a poor idea to supply a prototype within your code – user3629249 Oct 06 '17 at 06:55
  • the `srandom()` function is prototyped in `stdlib.h`, so it is a poor idea to supply a prototype within your code. – user3629249 Oct 06 '17 at 07:12
  • I would start by asking why I expected the parallel version to be faster. Does parallelizing the computation allow the use of CPU resources that would otherwise be standing idle? Then I'd check whether work is actually being distributed across CPU cores. – Kevin Boone Oct 06 '17 at 07:23
  • @user3629249 I'm not familiar with C atm. Thanks for the advice, I'll modify the code. – shilaide Oct 06 '17 at 07:51
  • you might try calculating `pi` via 355.0 / 113.0. Quick, fast, and accurate to a modest number of decimal places – user3629249 Oct 06 '17 at 07:53
  • suggest replacing `unsigned int cconst;` with `uint32_t cconst;` (from the header file stdint.h) so you do not need to check the size of the variable – user3629249 Oct 06 '17 at 07:55

1 Answer


Where is the parallelization?

The serial version computes pi with 5,000,000 iterations. In the parallel version, each task also performs DARTS * ROUNDS = 50,000 * 100 = 5,000,000 iterations before you take the average. So the parallel version might be "statistically more accurate", but it is not faster.

Also, you do 100 MPI_Reduce() calls (one per round) when I think only one is needed. Bottom line, I am even surprised the "parallel" version is not much slower.

If you want to run faster via parallelization, each task should compute 5,000,000 / numtasks iterations (the chunk starting at 5,000,000 * taskid / numtasks), and then you should issue a single MPI_Reduce(), as in the sketch below.
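
Here is a minimal sketch of that approach (a rewrite for illustration, not the original example code); it assumes numtasks divides the dart count evenly, and it times with MPI_Wtime() instead of gettimeofday():

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

#define TOTAL_DARTS 5000000   /* same total as the serial version */

int main(int argc, char *argv[])
{
   int taskid, numtasks;
   long n, my_darts, my_score = 0, total_score = 0;

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
   MPI_Comm_rank(MPI_COMM_WORLD, &taskid);

   srandom(taskid);                      /* per-rank seed, as in the example */
   my_darts = TOTAL_DARTS / numtasks;    /* assumes an even split */

   double t0 = MPI_Wtime();              /* MPI wall-clock timer */
   for (n = 0; n < my_darts; n++) {
      /* 2147483648.0 is 2^31, the same scaling as cconst above */
      double x = 2.0 * ((double)random() / 2147483648.0) - 1.0;
      double y = 2.0 * ((double)random() / 2147483648.0) - 1.0;
      if (x * x + y * y <= 1.0)
         my_score++;
   }

   /* one reduction for the whole run instead of one per round */
   MPI_Reduce(&my_score, &total_score, 1, MPI_LONG, MPI_SUM,
              0, MPI_COMM_WORLD);

   if (taskid == 0) {
      double pi = 4.0 * (double)total_score / (double)(my_darts * numtasks);
      printf("PI = %lf\nParallel execution time: %lf seconds\n",
             pi, MPI_Wtime() - t0);
   }
   MPI_Finalize();
   return 0;
}

Note that this sketch just gives each rank its own seed rather than skipping ahead in a single stream, so its result will not be bit-identical to the serial run; offsetting each task's chunk as suggested above would be needed for that.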

Gilles Gouaillardet
  • Right!!! I simply ran the parallel version from the example but didn't notice this. And thanks for the advice about MPI_Reduce(). – shilaide Oct 06 '17 at 07:50