5

I'm trying to use OpenMP to parallelize QuickSort in both the partition step and the recursive QuickSort calls. My C code is as follows:

#include "stdlib.h"
#include "stdio.h"
#include "omp.h"

// parallel partition
int ParPartition(int *a, int p, int r) {
    int b[r-p];
    int key = *(a+r); // use the last element in the array as the pivot
    int lt[r-p]; // mark 1 at the position where its element is smaller than the key, else 0
    int gt[r-p]; // mark 1 at the position where its element is bigger than the key, else 0
    int cnt_lt = 0; // count 1 in the lt array
    int cnt_gt = 0; // count 1 in the gt array
    int j=p;
    int k = 0; // the position of the pivot
    // deal with gt and lt array
    #pragma omp parallel for
    for ( j=p; j<r; ++j) {
        b[j-p] = *(a+j);
        if (*(a+j) < key) {
            lt[j-p] = 1;
            gt[j-p] = 0;
        } else {
            lt[j-p] = 0;
            gt[j-p] = 1;
        }
    }
    // calculate the new position of the elements
    for ( j=0; j<(r-p); ++j) {
        if (lt[j]) {
            ++cnt_lt;
            lt[j] = cnt_lt;
        } else
            lt[j] = cnt_lt;
        if (gt[j]) {
            ++cnt_gt;
            gt[j] = cnt_gt;
        } else
            gt[j] = cnt_gt;
    }
    // move the pivot
    k = lt[r-p-1];
    *(a+p+k) = key;
    // move elements to their new position
    #pragma omp parallel for 
    for ( j=p; j<r; ++j) {
        if (b[j-p] < key)
            *(a+p+lt[j-p]-1) = b[j-p];
        else if (b[j-p] > key)
            *(a+k+gt[j-p]) = b[j-p];
    }
    return (k+p);
}

void ParQuickSort(int *a, int p, int r) {
    int q;
    if (p<r) {
        q = ParPartition(a, p, r);
        #pragma omp parallel sections
        {
        #pragma omp section
        ParQuickSort(a, p, q-1);
        #pragma omp section
        ParQuickSort(a, q+1, r);
        }
    }
}

int main() {
    int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
    ParQuickSort(a, 0, 9);
    int i=0;
    for (; i!=10; ++i)
        printf("%d\t", a[i]);
    printf("\n");
    return 0;
}

For the example in the main function, the sorting result is:

0   9   9   2   2   2   6   7   7   7

I used gdb to debug. The early recursions went well, but at some point the array suddenly got messed up and elements started being duplicated, producing the result above.

Can someone help me figure out where the problem is?

dreamcrash
randomp

3 Answers

3

I decided to post this answer because:

  1. the accepted answer is wrong, and its author seems inactive these days. There is a race condition in the following loop (one way to remove it is sketched just after this list):

     #pragma omp parallel for
     for(i = p; i < r; i++){
         if(a[i] < a[r]){
             lt[lt_n++] = a[i]; //<- race condition lt_n is shared
         }else{
             gt[gt_n++] = a[i];  //<- race condition gt_n is shared
         }   
     }   
    
  2. Nonetheless, even if it were correct, the modern answer to this question is to use OpenMP tasks instead of sections.
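For reference, here is a minimal sketch of one way that particular data race could be removed, using atomic captures to reserve slots in lt and gt. This is my own illustration, not code from the accepted answer; it makes the loop correct, but the atomics largely serialize it, which is one more reason to prefer the task-based version below:

     #pragma omp parallel for
     for (int i = p; i < r; i++) {
         int idx;
         if (a[i] < a[r]) {
             #pragma omp atomic capture
             idx = lt_n++;        // atomically reserve the next free slot in lt
             lt[idx] = a[i];
         } else {
             #pragma omp atomic capture
             idx = gt_n++;        // atomically reserve the next free slot in gt
             gt[idx] = a[i];
         }
     }

The order of elements inside lt and gt becomes nondeterministic, which does not matter for a partition.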

I am providing the community with a full, runnable example of this approach, including tests and profiling.

#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#define TASK_SIZE 100

unsigned int rand_interval(unsigned int min, unsigned int max)
{
    // https://stackoverflow.com/questions/2509679/
    int r;
    const unsigned int range = 1 + max - min;
    const unsigned int buckets = RAND_MAX / range;
    const unsigned int limit = buckets * range;

    do
    {
        r = rand();
    } 
    while (r >= limit);

    return min + (r / buckets);
}

void fillupRandomly (int *m, int size, unsigned int min, unsigned int max){
    for (int i = 0; i < size; i++)
        m[i] = rand_interval(min, max);
}


void init(int *a, int size){
   for(int i = 0; i < size; i++)
       a[i] = 0;
}

void printArray(int *a, int size){
   for(int i = 0; i < size; i++)
       printf("%d ", a[i]);
   printf("\n");
}

int isSorted(int *a, int size){
   for(int i = 0; i < size - 1; i++)
      if(a[i] > a[i + 1])
        return 0;
   return 1;
}


int partition(int * a, int p, int r)
{
    int lt[r-p];
    int gt[r-p];
    int i;
    int j;
    int key = a[r];
    int lt_n = 0;
    int gt_n = 0;

    for(i = p; i < r; i++){
        if(a[i] < a[r]){
            lt[lt_n++] = a[i];
        }else{
            gt[gt_n++] = a[i];
        }   
    }   

    for(i = 0; i < lt_n; i++){
        a[p + i] = lt[i];
    }   

    a[p + lt_n] = key;

    for(j = 0; j < gt_n; j++){
        a[p + lt_n + j + 1] = gt[j];
    }   

    return p + lt_n;
}

void quicksort(int * a, int p, int r)
{
    int div;

    if(p < r){ 
        div = partition(a, p, r); 
        #pragma omp task shared(a) if(r - p > TASK_SIZE) 
        quicksort(a, p, div - 1); 
        #pragma omp task shared(a) if(r - p > TASK_SIZE)
        quicksort(a, div + 1, r); 
    }
}
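
/* Note on the if clauses above (added commentary, not part of the original
   answer): when r - p <= TASK_SIZE the if clause evaluates to false and the
   task is undeferred, i.e. it runs immediately in the encountering thread,
   which avoids scheduling overhead for tiny sub-ranges. No explicit taskwait
   is needed for correctness here, because all tasks are guaranteed to finish
   at the implicit barrier that ends the parallel region in main. */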

int main(int argc, char *argv[])
{
        srand(123456);
        int N  = (argc > 1) ? atoi(argv[1]) : 10;
        int print = (argc > 2) ? atoi(argv[2]) : 0;
        int numThreads = (argc > 3) ? atoi(argv[3]) : 2;
        int *X = malloc(N * sizeof(int));
        int *tmp = malloc(N * sizeof(int));

        omp_set_dynamic(0);              /** Explicitly disable dynamic teams **/
        omp_set_num_threads(numThreads); /** Use N threads for all parallel regions **/

        // Deal with failed memory allocation
        if(!X || !tmp)
        { 
           if(X) free(X);
           if(tmp) free(tmp);
           return (EXIT_FAILURE);
        }

        fillupRandomly (X, N, 0, 5);

        double begin = omp_get_wtime();
        #pragma omp parallel
        {
            #pragma omp single
             quicksort(X, 0, N - 1); // last valid index is N-1; partition uses a[r] as the pivot
        }   
        double end = omp_get_wtime();
        printf("Time: %f (s) \n",end-begin);
    
        assert(1 == isSorted(X, N));

        if(print){
           printArray(X, N);
        }

        free(X);
        free(tmp);
        return (EXIT_SUCCESS);
}

How to run:

This program accepts three parameters:

  1. The size of the array;
  2. Whether to print the array: 0 for no, anything else for yes;
  3. The number of threads to run in parallel.
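
For example, assuming GCC and that the source is saved as quicksort.c (the file name is only an illustration), it can be compiled and run like this:

gcc -fopenmp -O2 quicksort.c -o quicksort
./quicksort 100000 0 4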

Mini Benchmark

On a 4-core machine, with an input of 100000 elements:

1 Thread  -> Time: 0.784504 (s)
2 Threads -> Time: 0.424008 (s) ~ speedup 1.85x
4 Threads -> Time: 0.282944 (s) ~ speedup 2.77x
dreamcrash
2

I feel sorry about my first comment; it was not relevant to your problem. I have not found the true cause of your problem (maybe the element-moving part is at fault). Following your approach, I wrote a similar program, and it works fine. (I am also new to OpenMP.)

#include <stdio.h>
#include <stdlib.h>


int partition(int * a, int p, int r)
{
    int lt[r-p];
    int gt[r-p];
    int i;
    int j;
    int key = a[r];
    int lt_n = 0;
    int gt_n = 0;

#pragma omp parallel for
    for(i = p; i < r; i++){
        if(a[i] < a[r]){
            lt[lt_n++] = a[i];
        }else{
            gt[gt_n++] = a[i];
        }   
    }   

    for(i = 0; i < lt_n; i++){
        a[p + i] = lt[i];
    }   

    a[p + lt_n] = key;

    for(j = 0; j < gt_n; j++){
        a[p + lt_n + j + 1] = gt[j];
    }   

    return p + lt_n;
}

void quicksort(int * a, int p, int r)
{
    int div;

    if(p < r){ 
        div = partition(a, p, r); 
#pragma omp parallel sections
        {   
#pragma omp section
            quicksort(a, p, div - 1); 
#pragma omp section
            quicksort(a, div + 1, r); 

        }
    }
}

int main(void)
{
    int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
    int i;

    quicksort(a, 0, 9);

    for(i = 0;i < 10; i++){
        printf("%d\t", a[i]);
    }
    printf("\n");
    return 0;
}
MYMNeo
  • Thank you! I guess it's because of the b array? – randomp Apr 16 '13 at 02:25
  • @randomp, I can just say 'maybe'.:-) – MYMNeo Apr 16 '13 at 02:36
  • 4
    I see no reason why this would ever work. In the `parallel for`, `lt_n` and `gt_n` are possibly modified by more than one thread without any synchronization. Maybe the array is just so small that only one thread is working on that section. – ftfish Feb 03 '15 at 17:38
  • 8
    **Update:** I ran this multiple times and indeed saw wrong result: `0 1 2 3 5 6 6 7 8 9`. Therefore the code is **wrong**. @randomp – ftfish Feb 03 '15 at 17:41
  • 1
    @ftfish yep, this code seems wrong, I have posted a solution if you want to check – dreamcrash Mar 21 '21 at 08:34
0

I've implemented parallel quicksort in a production environment, although with concurrent processes (i.e. fork() and join()) and not OpenMP. I also found a pretty good pthread solution, but a concurrent process solution was the best in terms of worst-case runtime. Let me start by saying that it doesn't seem like you're making copies of your input array for each thread, so you'll definitely encounter race conditions which can corrupt your data.

Essentially, what is happening is that you have created an array a of N elements in shared memory, and when you hit a #pragma omp parallel sections, you spawn as many worker threads as there are #pragma omp section's. Each time a worker thread accesses and modifies an element of a, it executes a series of instructions: "read the n'th value of a from the given address", "modify the n'th value of a", "write the n'th value of a back to the given address". Since you have multiple threads with no locking or synchronization, these read, modify, and write instructions may be interleaved in any order across processors, so the threads may overwrite each other's modifications or read stale values.
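
As a minimal, standalone illustration of that read-modify-write problem (a hypothetical snippet of my own, not code from the question):

#include <stdio.h>

int main(void) {
    int counter = 0;
    // counter++ is really load / add / store; with no synchronization,
    // threads interleave those steps and overwrite each other's updates,
    // so the printed value is typically well below 1000000.
    #pragma omp parallel for
    for (int i = 0; i < 1000000; i++)
        counter++;               // unsynchronized update of shared data (data race)
    printf("%d\n", counter);     // would be 1000000 without the race
    return 0;
}

Compile with -fopenmp; adding #pragma omp atomic before the increment (or a reduction(+:counter) clause on the loop) removes the race.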

The best solution that I found (after many weeks of testing and benchmarking many solutions that I came up with) is to subdivide the list log(n) times, where n is the number of processors. For example, if you have a quad core machine (n = 4), subdivide the list 2 times (log(4) = 2) choosing pivots that are the medians of the data set. It is important that the pivots are medians, because otherwise you can end up with a case where a poorly chosen pivot causes the lists to be distributed unevenly amongst processes. Then each process does quicksort on its local subarray, then merges its results with the results of other processes. This is called "hyperquicksort", and from an initial github search, I found this. I can't vouch for the code in there, and can't publish any of the code that I wrote since it is protected under an NDA.

By the way, one of the best parallel sorting algorithms is PSRS (Parallel Sorting by Regular Sampling), which keeps list sizes more balanced amongst processes, doesn't unnecessarily communicate keys between processes, and works with an arbitrary number of concurrent processes (it doesn't have to be a power of 2).

Keshav Saharia
  • 3
    This seems to be wrong. The algorithm divides the array to be sorted into smaller partial lists of the initial list. As long as the spawned threads only swap elements inside their "own" partial list, no race conditions can arise. – Quxflux Oct 28 '14 at 12:20