I am writing code for n-ary search i.e splitting search space into n parts. When compared parallel code with the code without OpenMP directives i.e serial execution , I observed that parallel code is slower than serial code many times. After executing both programs many times I saw little speedup in parallel code but not every time. This is maybe due to cache hierarchy. I am running program on quad-core processor with 4GB RAM.
According to answer on No speedup with OpenMP Memory bound performance and Load balancing is not applicable for small problems such as with array SIZE 100
. I have not used any synchronization. I also tried increasing array size to 10000000 but output is not always faster with parallel code. Many times serial code beats parallel code.
According to http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-loop.html The implicit barrier at the end of a work sharing construct can be cancelled with a nowait
clause. I tried adding nowait
clause also I also tried schedule(dynamic) and schedule(auto) referring to https://software.intel.com/en-us/articles/openmp-loop-scheduling but still same problem.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define SIZE 100
#define NUM_THREADS 4
int* a;
int num;
void nary(int num)
{
int found = 0, low = 0, high = SIZE, step;
int i = 0;
while(!found && low <= high)
{
step = (high-low)/NUM_THREADS;
printf("Low :- %d\tHigh :- %d\tStep :- %d\n", low,high,step);
printf("\n");
#pragma omp parallel for num_threads(NUM_THREADS) shared(low,high,step)
for (i = 0; i < NUM_THREADS; ++i)
{
printf("First element :- %d by thread :- %d\n", a[low+step*i],omp_get_thread_num());
if (a[low+step*i] == num)
{
found = 1;
}
}
printf("\n");
/* First block */
if (a[low+step] > num)
{
high = low + step - 1;
printf("First \nLow :- %d \nHigh :- %d\n\n",low,high);
}
/* Last block */
else if (a[low+step*(NUM_THREADS-1)] < num)
{
low = low + step * (NUM_THREADS-1) + 1;
printf("Last\nLow :- %d \nHigh :- %d\n\n",low,high);
}
/* Middle blocks */
else{
#pragma omp parallel for num_threads(NUM_THREADS) schedule(static) shared(low,high,step)
for (i = 1; i < (NUM_THREADS-1); ++i)
{
if (a[low+step*i] < num && a[low+step*(i+1)] > num)
{
low = low + step*i + 1;
high = low + step*(i+1) - 1;
}
}
printf("middle\nLow :- %d \nHigh :- %d\n\n",low,high);
}
}
if (found == 1)
{
printf("Element found\n");
}
else
{
printf("Element Not found\n");
}
}
int main()
{
int i = 0;
int startTime = omp_get_wtime();
/* Dynamically allocate memory using malloc() */
a = (int*)malloc(sizeof(int) * SIZE);
#pragma omp parallel for schedule(static)
for (i = 0; i < SIZE; ++i)
{
a[i] = i;
}
printf("Enter the element to be searched :- \n");
scanf("%d", &num);
nary(num);
printf("\nExecution time :- %f\n", omp_get_wtime()-startTime);
return 0;
}
Parallel Execution Output:
Enter the element to be searched :-
20
Low :- 0 High :- 100 Step :- 25
First element :- 0 by thread :- 0
First element :- 50 by thread :- 2
First element :- 25 by thread :- 1
First element :- 75 by thread :- 3
First
Low :- 0
High :- 24
Low :- 0 High :- 24 Step :- 6
First element :- 6 by thread :- 1
First element :- 18 by thread :- 3
First element :- 0 by thread :- 0
First element :- 12 by thread :- 2
Last
Low :- 19
High :- 24
Low :- 19 High :- 24 Step :- 1
First element :- 20 by thread :- 1
First element :- 21 by thread :- 2
First element :- 19 by thread :- 0
First element :- 22 by thread :- 3
middle
Low :- 19
High :- 24
Element found
Execution time :- 26.824379
Serial Execution Output:
Enter the element to be searched :-
20
Low :- 0 High :- 100 Step :- 25
First element :- 0 by thread :- 0
First element :- 25 by thread :- 0
First element :- 50 by thread :- 0
First element :- 75 by thread :- 0
First
Low :- 0
High :- 24
Low :- 0 High :- 24 Step :- 6
First element :- 0 by thread :- 0
First element :- 6 by thread :- 0
First element :- 12 by thread :- 0
First element :- 18 by thread :- 0
Last
Low :- 19
High :- 24
Low :- 19 High :- 24 Step :- 1
First element :- 19 by thread :- 0
First element :- 20 by thread :- 0
First element :- 21 by thread :- 0
First element :- 22 by thread :- 0
middle
Low :- 19
High :- 24
Element found
Execution time :- 4.349347
What can be the reason behind this? Is this because there are many conditional statements in the code or the for loop inside the conditional block?