I am new to OpenMP and I have to complete this project where I need to resolve a 2D matrix using Jacobi iteration method to resolve a heat conductivity problem using OpenMP.
Essentially It is a plate with four walls at the sides which have fixed temperatures and I need to work out the unknown temperature values in the middle.
The code has been given to us and what I am expected to do is three simple things:
- Time the serial code
- Parallelise the serial code and compare
- Further optimise the parallel code if possible
I have ran the serial code and parallelised the code to make a comparison. Before any optimisation, for some reason the serial code is consistently doing better. I can't help but think I am doing something wrong?
I will try compiler optimisations for both, but I expected parallel code to be faster. I have chosen a large problem size for the matrix including a 100 x 100, 300 x 300 array and every single time almost the serial code is doing better.
Funny thing is, the more threads I add, the slower it gets.
I understand for a small problem size it would be a larger overhead, but I thought this is a large enough problem size?
This is before any significant optimisation, am I doing something obviously wrong that makes it like this?
Here is the code:
Serial code:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>
int main(int argc, char *argv[])
{
int m;
int n;
double tol;// = 0.0001;
int i, j, iter;
m = atoi(argv[1]);
n = atoi(argv[2]);
tol = atof(argv[3]);
/**
* @var t, tnew, diff, diffmax,
* t is the old temprature array,tnew is the new array
*/
double t[m+2][n+2], tnew[m+1][n+1], diff, difmax;
/**
* Timer variables
* @var start, end
*/
double start, end;
printf("%d %d %lf\n",m,n, tol);
start = omp_get_wtime();
// initialise temperature array
for (i=0; i <= m+1; i++) {
for (j=0; j <= n+1; j++) {
t[i][j] = 30.0;
}
}
// fix boundary conditions
for (i=1; i <= m; i++) {
t[i][0] = 33.0;
t[i][n+1] = 42.0;
}
for (j=1; j <= n; j++) {
t[0][j] = 20.0;
t[m+1][j] = 25.0;
}
// main loop
iter = 0;
difmax = 1000000.0;
while (difmax > tol) {
iter++;
// update temperature for next iteration
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
}
}
// work out maximum difference between old and new temperatures
difmax = 0.0;
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > difmax) {
difmax = diff;
}
// copy new to old temperatures
t[i][j] = tnew[i][j];
}
}
}
end = omp_get_wtime();
// print results,
//Loop tempratures commented out to save performance
printf("iter = %d difmax = %9.11lf\n", iter, difmax);
printf("Time in seconds: %lf \n", end - start);
// for (i=0; i <= m+1; i++) {
// printf("\n");
// for (j=0; j <= n+1; j++) {
// printf("%3.5lf ", t[i][j]);
// }
// }
// printf("\n");
}
Here is the parallel code:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>
int main(int argc, char *argv[])
{
int m;
int n;
double tol;// = 0.0001;
/**
* @brief Integer variables
* @var i external loop (y column array) counter,
* @var j internal loop (x row array counter) counter,
* @var iter number of iterations,
* @var numthreads number of threads
*/
int i, j, iter, numThreads;
m = atoi(argv[1]);
n = atoi(argv[2]);
tol = atof(argv[3]);
numThreads = atoi(argv[4]);
/**
* @brief Double variables
* @var t, tnew -> The variable that holds the temprature, the t is the old value and the tnew is the new value,
* @var diff Measures the difference,
* @var diffmax
* t is the temprature array, I guess it holds the matrix?
*
*/
double t[m+2][n+2], tnew[m+1][n+1], diff, diffmax, privDiffmax;
/**
* Timer variables
* @var start, end
*/
double start, end;
/**
* @brief Print the problem size & the tolerance
* This print statement can be there as it is not part of the parallel region
* We also print the number of threads when printing the problem size & tolerance
*/
//printf("%d %d %lf %d\n",m,n, tol, numThreads);
omp_set_num_threads(numThreads);
/**
* @brief Initialise the timer
*
*/
start = omp_get_wtime();
/**
* @brief Creating the parallel region:
* Here both loop counters are private:
*/
#pragma omp parallel private(i, j)
{
/**
* @brief initialise temperature array
* This can be in a parallel region by itself
*/
#pragma omp for collapse(2) schedule(static)
for (i=0; i <= m+1; i++) {
for (j=0; j <= n+1; j++) {
t[i][j] = 30.0;
}
}
// fix boundary conditions
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
t[i][0] = 33.0;
t[i][n+1] = 42.0;
}
#pragma omp for schedule(static)
for (j=1; j <= n; j++) {
t[0][j] = 20.0;
t[m+1][j] = 25.0;
}
}
// main loop
iter = 0;
diffmax = 1000000.0;
while (diffmax > tol) {
iter = iter + 1;
/**
* @brief update temperature for next iteration
* Here we have created a parallel for directive, this is the second parallel region
*/
#pragma omp parallel for private(i, j) collapse(2) schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
}
}
// work out maximum difference between old and new temperatures
diffmax = 0.0;
/**
* @brief Third parallel region that compares the difference
*/
#pragma omp parallel private(i, j, privDiffmax, diff)
{
privDiffmax = 0.0;
#pragma omp for collapse(2) schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > privDiffmax) {
privDiffmax = diff;
}
// copy new to old temperatures
t[i][j] = tnew[i][j];
}
}
#pragma omp critical
if (privDiffmax > diffmax)
{
diffmax = privDiffmax;
}
}
}
//Add timer for the end
end = omp_get_wtime();
// print results,
//Loop tempratures commented out to save performance
printf("iter = %d diffmax = %9.11lf\n", iter, diffmax);
printf("Time in seconds: %lf \n", end - start);
// for (i=0; i <= m+1; i++) {
// printf("\n");
// for (j=0; j <= n+1; j++) {
// printf("%3.5lf ", t[i][j]);
// }
// }
// printf("\n");
}
Here are some of the benchmarks for serial code:
I have ran the code and tested it, I have commented out the print statements as I dont need to see that except for to test. The code runs fine but somehow it is slower than the serial code.
I have an 8 core Apple Mac M1
I am new to OpenMP and can't help but think I am missing something. Any advice would be appreciated