
I am currently trying to get familiar with OpenMP. For practice I implemented a greedy "learning" algorithm with OpenMP. Then I measured the time with

time ./a.out

I compared it with my serial implementation, and no matter how many iterations my program does, the OpenMP version is always significantly slower.
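
To time just the search itself, rather than the whole process, one could also wrap the call to iterate() (shown in main() below) in omp_get_wtime(), roughly like this:

double t0 = omp_get_wtime();
LTM fav = iterate(its, td, 1, 6); //only the parallel search is timed
double t1 = omp_get_wtime();
printf("iterate() took %f seconds\n", t1 - t0);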

Here is my code; the comments should hopefully explain everything:

#include <omp.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <stdio.h>
#include <ctime>

#define THREADS 4

using namespace std;

struct TrainData {
    double input;
    double output;
};

//Long Term Memory struct 
struct LTM {
        double a; //parameter a of the polynomial
        double b;
        double c;
        double score; //score to be minimized!

        LTM()
        {
            a=0;
            b=0;
            c=0;
            score=0;
        }

        //random LTM with parameters from low to high (including low and high)
        LTM(int low, int high)
        {
            score=0;
            a= rand() % high + low;
            b= rand() % high + low;
            c= rand() % high + low; 

        }

        LTM(double _a, double _b, double _c)
        {
            a=_a;
            b=_b;
            c=_c;
        }

        void print()
        {
            cout<<"Score: "<<score<<endl;
            cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<endl;
        }
};

//the actual polynomial function, evaluated with the passed LTM
inline double evaluate(LTM &ltm, const double &x)
{
    double ret;
    ret = ltm.a*x*x + ltm.b*x + ltm.c;

    return ret; 
}


//scoring function calculates the Root Mean Square error (RMS)
inline double score_function(LTM &ltmnew, vector<TrainData> &td)
{
    double score;
    double val;
    int tdsize=td.size();
    score=0;

    for(int i=0; i< tdsize; i++)
    {
        val = (td.at(i)).output -  evaluate(ltmnew, (td.at(i)).input);
        val *=  val;
        score += val;
    }

    score /= (double)tdsize;

    score = sqrt(score);

    return score;
}

LTM iterate(int iterations, vector<TrainData> td, int low, int high)
{
    LTM fav = LTM(low,high);
    fav.score = score_function(fav, td);
    fav.print();
    LTM favs[THREADS]; // array for collecting the favorites of each thread

    #pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
    {
        #pragma omp master
        printf("Threads: %d\n", omp_get_num_threads());

        LTM cand;
        #pragma omp for private(cand)
        for(int i=0; i<iterations; i++)
        {
            cand = LTM(low, high);
            cand.score = score_function(cand, td);

            if(cand.score < fav.score)
                fav = cand;
        }

        //save the favorite before ending the parallel section
        #pragma omp critical
        favs[omp_get_thread_num()] = fav;
    }

    //search for the best one in the array
    for(int i=0; i<THREADS; i++)
    {
        if(favs[i].score < fav.score)
            fav=favs[i];
    }

    return fav;
}

//generate training data from -50 up to 50 with the train LTM
void generateTrainData(vector<TrainData> *td, LTM train)
{
    #pragma omp parallel for schedule(dynamic, 25) 
    for(int i=-50; i< 50; i++)
    {
        struct TrainData d;
        d.input = i;
        d.output = evaluate(train, (double)i);
        #pragma omp critical
        td->push_back(d);

        //cout<<"input: "<<d.input<<" -> "<<d.output<<endl;
    }

}

int main(int argc, char *argv[])
{

    int its= 10000000; //number of iterations 
    int a=2;
    int b=4;
    int c=6;

    srand(time(NULL));
    LTM pol = LTM(a,b,c); //original polynomial parameters
    vector<TrainData> td;

    //first generate some training data and save it to td
    generateTrainData(&td, pol); 

    //try to find the best solution
    LTM fav = iterate( its, td, 1, 6);


    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);

    return 0;
}

On my home PC this implementation took 12 s, while the serial one took only 6 s. If I increase the number of iterations by a factor of 10, it is around 2 min / 1 min (OpenMP / serial).

Can anyone help me?

Talto

1 Answer


Okay, thanks to the comments on my initial question I could solve the performance issue.

As the comments pointed out, the problem was the rand() function I was using: it relies on shared hidden state, so calling it from several threads at once makes them contend for that state. I replaced it with the thread-safe drand48_r(), which works on a state buffer supplied by the caller.

Like:

...
LTM(double low, double high, struct drand48_data *buff)
{
    score=0;
    double x;
    drand48_r(buff,&x);
    a= low + x * (high - low);
    drand48_r(buff,&x);
    b= low + x * (high - low);
    drand48_r(buff,&x);
    c= low + x * (high - low);

}
...

Now I get times under one second! Thanks! :)
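
For anyone wondering how the state buffer reaches the constructor: a rough sketch of the wiring inside iterate() could look like the following (the seeding scheme is just one possible choice; low and high are converted to double by the new constructor):

...
#pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
{
    //one PRNG state per thread, seeded differently for each thread
    struct drand48_data buff;
    srand48_r(time(NULL) ^ omp_get_thread_num(), &buff);

    LTM cand;
    #pragma omp for
    for(int i=0; i<iterations; i++)
    {
        cand = LTM(low, high, &buff); //uses the drand48_r constructor above
        cand.score = score_function(cand, td);

        if(cand.score < fav.score)
            fav = cand;
    }

    //save the favorite before ending the parallel section
    #pragma omp critical
    favs[omp_get_thread_num()] = fav;
}
...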

    You could also use the [C++11 random generators](http://en.cppreference.com/w/cpp/numeric/random), which have a nicer API, provide more choices for the PRNG algorithm, and can adapt the output into different statistical distributions. Just declare your random engine variable within the `#pragma omp parallel` block so it's local to each thread. – Wyzard May 19 '17 at 12:50
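
A minimal sketch of what that comment suggests, with an arbitrarily chosen engine and distribution (std::mt19937 and std::uniform_real_distribution), each declared inside the parallel region so every thread owns its own state; sample_candidates() is just a stand-in for the candidate loop in iterate():

#include <omp.h>
#include <random>

void sample_candidates(int iterations, double low, double high)
{
    #pragma omp parallel
    {
        //declared inside the parallel region -> one engine per thread
        std::mt19937 engine(std::random_device{}() + omp_get_thread_num());
        std::uniform_real_distribution<double> dist(low, high);

        #pragma omp for
        for(int i = 0; i < iterations; i++)
        {
            double a = dist(engine);
            double b = dist(engine);
            double c = dist(engine);
            //...build an LTM from a, b, c and score it as before...
        }
    }
}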