Because of the data dependencies and the low computational requirements, this is unlikely to ever give you much of a speedup in multicore - however, you can do something by calculating within each chunk of the array the local min, max, and local region best, and then compare that across chunks. Because of the final step, this runs in O(N) + O(P2) time, further limiting scalability.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <limits.h>
#include <omp.h>
void tick(struct timeval *t);
double tock(const struct timeval * const t);
unsigned int maxDiff(const int * const arr, const int arr_size)
{
int max_diff = arr[1] - arr[0];
int min_element = arr[0];
int i;
for(i = 1; i < arr_size; i++)
{
if(arr[i] - min_element > max_diff)
max_diff = arr[i] - min_element;
if(arr[i] < min_element)
min_element = arr[i];
}
return max_diff;
}
unsigned int ompMaxDiff(const int * const arr, const int arr_size)
{
int nthreads=omp_get_max_threads();
int maxes[nthreads];
int mins [nthreads];
unsigned int best = 0;
for (int i=0; i<nthreads; i++) {
mins [i] = INT_MAX;
maxes[i] = INT_MIN;
}
#pragma omp parallel num_threads(nthreads) default(none) shared(mins, maxes) reduction(max:best)
{
int idx = omp_get_thread_num();
int min = INT_MAX, max = INT_MIN;
#pragma omp for schedule(static)
for(int i=0; i<arr_size; i++) {
if (arr[i] < min) min=arr[i];
if (arr[i] > max) max=arr[i];
if ((arr[i] - min) > best) best = arr[i] - min;
}
mins [idx] = min;
maxes[idx] = max;
}
for (int i=0; i<nthreads-1; i++)
for (int j=i+1; j<nthreads; j++)
if ((maxes[j] - mins[i]) > best) best = maxes[j]-mins[i];
return best;
}
int main(int argc, char **argv) {
const int nitems=1000000;
int *data = malloc(nitems*sizeof(int));
srand(time(NULL));
for (int i=0; i<nitems; i++)
data[i] = rand() % 500; /* numbers between 0 and 500 */
data[(nitems/2)+1] = -700;
data[(nitems/2)] = 700; /* a trick! shouldn't get 1400, */
/* should get <= 1200 */
struct timeval start;
tick(&start);
unsigned int res = maxDiff(data, nitems);
double restime = tock(&start);
printf("Serial: answer = %u, time = %lf\n", res, restime);
tick(&start);
res = ompMaxDiff(data, nitems);
restime = tock(&start);
printf("OpenMP: answer = %u, time = %lf\n", res, restime);
free(data);
return 0;
}
void tick(struct timeval *t) {
gettimeofday(t, NULL);
}
double tock(const struct timeval * const t) {
struct timeval now;
gettimeofday(&now, NULL);
return (double)(now.tv_sec - t->tv_sec) + ((double)(now.tv_usec - t->tv_usec)/1000000.);
}
Running on 8 cores gives:
$ gcc -fopenmp -O3 -Wall -std=c11 maxdiff.c -o maxdiff
$ ./maxdiff
Serial: answer = 1199, time = 0.001760
OpenMP: answer = 1199, time = 0.000488