I am currently trying to get familiar with OpenMP. For practice I implemented a greedy "learning" algorithm with OpenMP. Then I measured the time with
time ./a.out
I compared it with my serial implementation, and no matter how many iterations my program does, the OpenMP version is always significantly slower.
Here is my code; the comments should hopefully explain everything:
#include <omp.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <stdio.h>
#include <ctime>
#define THREADS 4
using namespace std;
struct TrainData {
double input;
double output;
};
//Long Term Memory struct
struct LTM {
double a; //parameter a of the polynomial
double b;
double c;
double score; //score to be minimized!
LTM()
{
a=0;
b=0;
c=0;
score=0;
}
//random LTM with parameters from low to high (including low and high)
LTM(int low, int high)
{
score=0;
a= rand() % high + low;
b= rand() % high + low;
c= rand() % high + low;
}
LTM(double _a, double _b, double _c)
{
a=_a;
b=_b;
c=_c;
}
void print()
{
cout<<"Score: "<<score<<endl;
cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<endl;
}
};
//the actual polynomial function, evaluated with the passed LTM
inline double evaluate(LTM &ltm, const double &x)
{
double ret;
ret = ltm.a*x*x + ltm.b*x + ltm.c;
return ret;
}
//scoring function calculates the Root Mean Square error (RMS)
inline double score_function(LTM &ltmnew, vector<TrainData> &td)
{
double score;
double val;
int tdsize=td.size();
score=0;
for(int i=0; i< tdsize; i++)
{
val = (td.at(i)).output - evaluate(ltmnew, (td.at(i)).input);
val *= val;
score += val;
}
score /= (double)tdsize;
score = sqrt(score);
return score;
}
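//greedy search: generate random candidate LTMs and keep the one with the lowest score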
LTM iterate(int iterations, vector<TrainData> td, int low, int high)
{
LTM fav = LTM(low,high);
fav.score = score_function(fav, td);
fav.print();
LTM favs[THREADS]; // array for collecting the favorites of each thread
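//each thread searches with its own private copies of fav, low, high and td (firstprivate)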
#pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
{
#pragma omp master
printf("Threads: %d\n", omp_get_num_threads());
LTM cand;
#pragma omp for private(cand)
for(int i=0; i<iterations; i++)
{
cand = LTM(low, high);
cand.score = score_function(cand, td);
if(cand.score < fav.score)
fav = cand;
}
//save the favorite before ending the parallel section
#pragma omp critical
favs[omp_get_thread_num()] = fav;
}
//search for the best one in the array
for(int i=0; i<THREADS; i++)
{
if(favs[i].score < fav.score)
fav=favs[i];
}
return fav;
}
//generate training data from -50 up to 50 with the train LTM
void generateTrainData(vector<TrainData> *td, LTM train)
{
#pragma omp parallel for schedule(dynamic, 25)
for(int i=-50; i< 50; i++)
{
struct TrainData d;
d.input = i;
d.output = evaluate(train, (double)i);
#pragma omp critical
td->push_back(d);
//cout<<"input: "<<d.input<<" -> "<<d.output<<endl;
}
}
int main(int argc, char *argv[])
{
int its= 10000000; //number of iterations
int a=2;
int b=4;
int c=6;
srand(time(NULL));
LTM pol = LTM(a,b,c); //original polynomial parameters
vector<TrainData> td;
//first generate some training data and save it to td
generateTrainData(&td, pol);
//try to find the best solution
LTM fav = iterate( its, td, 1, 6);
printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);
return 0;
}
On my home PC this implementation took 12 s, while the serial one took only 6 s. If I increase the number of iterations by a factor of 10, it is around 2 min / 1 min (OMP / serial).
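To rule out program startup and the training-data generation, I could also time just the search from inside the program with omp_get_wtime(), something like this (using the same its, td and iterate from the code above):
double t0 = omp_get_wtime();
LTM fav = iterate(its, td, 1, 6);
double t1 = omp_get_wtime();
printf("iterate took %f seconds\n", t1 - t0);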
Can anyone help me?