0

I am trying to find median value for every column in 2D array with cuda.
size of 2D array = [1000,500]

So I found quicksort cuda code at CUDA SAMPLES.(C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.2\0_Simple\cdpSimpleQuicksort)
It sorts a 1-D vector, and then I can find the median value simply by indexing.

I changed SAMPLE code like below

  1. Execute kernel with 500 threads. Each kernel sorting vector(1000 elements)
    cdp_simple_quicksort << < 1, 500 >> > (data, left, right, 0);
  2. Inside of kernel, indexing like &data[1000 * idx_x] for each thread can sort 1000 elements.
    __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)

Problems

Execution time is far too long in my case compared with the single-vector case.
Single execution time for 1000 = about 700us.
But 1000*500(500 threads) = 280ms.

Questions

  1. Why does it take so long compared to the simple case?
  2. Is there any better way to improve performance?(I am using cufft before this sorting, so I can not use thrust library)
// Selection sort used when depth gets too big or the number of elements drops
// below a threshold.
////////////////////////////////////////////////////////////////////////////////
// In-place selection sort of data[left..right] (inclusive bounds), ascending.
__device__ void selection_sort(unsigned int *data, int left, int right)
{
    // Grow the sorted prefix one slot at a time.
    for (int slot = left; slot <= right; ++slot)
    {
        int best_idx = slot;
        unsigned int best_val = data[slot];

        // Scan the unsorted suffix for the smallest remaining value.
        for (int scan = slot + 1; scan <= right; ++scan)
        {
            unsigned int candidate = data[scan];

            if (candidate < best_val)
            {
                best_idx = scan;
                best_val = candidate;
            }
        }

        // Move the minimum into the current slot (skip the no-op swap).
        if (best_idx != slot)
        {
            data[best_idx] = data[slot];
            data[slot] = best_val;
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Very basic quicksort algorithm, recursively launching the next level (CDP).
//
// Launched with one thread per column: thread idx_x sorts the NUM_ROWS
// contiguous elements starting at data[NUM_ROWS * idx_x]. Child launches use
// <<<1,1>>>, so the child's idx_x is 0 and the COLUMN BASE POINTER must be
// passed explicitly (the child then offsets from its own column 0).
// Requires compilation with relocatable device code (-rdc=true) for CDP.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
    // Problem shape used by this program: 500 columns of 1000 elements each.
    const int NUM_COLS = 500;
    const int NUM_ROWS = 1000;

    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;

    if (idx_x < NUM_COLS) {
        // Base of the column this thread is responsible for.
        unsigned int *col = &data[NUM_ROWS * idx_x];

        // If we're too deep or there are few elements left, fall back to the
        // O(n^2) selection sort. (The original sample's comment said
        // "insertion sort", but selection_sort is what is actually called.)
        if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)
        {
            selection_sort(col, left, right);
            return;
        }

        unsigned int *lptr = col + left;
        unsigned int *rptr = col + right;
        unsigned int  pivot = col[(left + right) / 2];

        // Hoare-style partitioning around the pivot.
        while (lptr <= rptr)
        {
            // Find the next left- and right-hand values to swap.
            unsigned int lval = *lptr;
            unsigned int rval = *rptr;

            // Advance the left pointer while it points at values below the pivot.
            while (lval < pivot)
            {
                lptr++;
                lval = *lptr;
            }

            // Retreat the right pointer while it points at values above the pivot.
            while (rval > pivot)
            {
                rptr--;
                rval = *rptr;
            }

            // If the swap points are valid, do the swap!
            if (lptr <= rptr)
            {
                *lptr++ = rval;
                *rptr-- = lval;
            }
        }

        // Now the recursive part: sub-range indices RELATIVE to the column
        // base, since the child kernel computes its offsets from idx_x == 0.
        int nright = rptr - col;
        int nleft  = lptr - col;

        // Launch a new grid to sort the left part.
        if (left < nright)
        {
            cudaStream_t s;
            cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
            // BUG FIX: the original passed data[1000 * idx_x] -- an element
            // VALUE (unsigned int) -- where the kernel expects a pointer.
            // Pass the column base pointer instead.
            cdp_simple_quicksort << < 1, 1, 0, s >> > (col, left, nright, depth + 1);
            cudaStreamDestroy(s);
        }

        // Launch a new grid to sort the right part.
        if (nleft < right)
        {
            cudaStream_t s1;
            cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
            cdp_simple_quicksort << < 1, 1, 0, s1 >> > (col, nleft, right, depth + 1);
            cudaStreamDestroy(s1);
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Call the quicksort kernel from the host.
//
//   data   - device pointer to 500 columns x 1000 rows of unsigned ints
//   nitems - total element count (kept for interface compatibility; the
//            per-column length is fixed at 1000 below)
////////////////////////////////////////////////////////////////////////////////
void run_qsort(unsigned int *data, unsigned int nitems)
{
    // Prepare CDP for the max depth 'MAX_DEPTH'.
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH);

    // Launch on device
    int left = 0;
    int column_len = 1000;          // elements per column (temporarily fixed for this case)
    int right = column_len - 1;
    int num_columns = 500;          // one thread sorts one column

    std::cout << "Launching kernel on the GPU" << std::endl;

    // PERF FIX: the original launched <<<1, 500>>> -- a SINGLE block of 500
    // threads, which confines all 500 column sorts to one SM. Spreading the
    // threads over several smaller blocks lets the scheduler distribute them
    // across SMs; the kernel already guards idx_x < 500, so any spare threads
    // in the last block are inert.
    int threads_per_block = 128;
    int blocks = (num_columns + threads_per_block - 1) / threads_per_block;
    cdp_simple_quicksort << < blocks, threads_per_block >> > (data, left, right, 0);

    // Surface launch-configuration errors, then wait for all child grids.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
    cudaDeviceSynchronize();
}

////////////////////////////////////////////////////////////////////////////////
// Initialize data on the host.
//
// Fills dst[0..nitems-1] with pseudo-random values in [0, nitems), using a
// fixed seed so every run produces the same sequence.
////////////////////////////////////////////////////////////////////////////////
void initialize_data(unsigned int *dst, unsigned int nitems)
{
    // Fixed seed for illustration
    srand(2047);

    // Fill dst with random values
    unsigned int idx = 0;
    while (idx < nitems)
    {
        dst[idx] = rand() % nitems;
        ++idx;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Verify the results.
//
// The kernel sorts each column of 'seg_len' elements independently, so the
// device buffer is NOT one globally sorted run -- it is a sequence of sorted
// runs of length 'seg_len'. Validate each run separately. (The original
// sample compared all n contiguous elements, which falsely fails at every
// column boundary once the data is sorted per-column.)
//
//   n         - total number of elements in results_d
//   results_d - device pointer to the sorted data
//   seg_len   - length of one independently-sorted run; defaults to 1000,
//               the column height used by this program
////////////////////////////////////////////////////////////////////////////////
void check_results(int n, unsigned int *results_d, int seg_len = 1000)
{
    unsigned int *results_h = new unsigned[n];
    cudaError_t err = cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        std::cout << "cudaMemcpy failed: " << cudaGetErrorString(err) << std::endl;
        delete[] results_h;
        exit(EXIT_FAILURE);
    }

    for (int i = 1; i < n; ++i)
    {
        // Do not compare across a column boundary -- the runs are independent.
        if (i % seg_len == 0)
            continue;

        if (results_h[i - 1] > results_h[i])
        {
            std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] << " greater than " << results_h[i] << std::endl;
            delete[] results_h;   // avoid leaking on the failure path
            exit(EXIT_FAILURE);
        }
    }

    std::cout << "OK" << std::endl;
    delete[] results_h;
}

////////////////////////////////////////////////////////////////////////////////
// Main entry point.
//
// Allocates a 500-column x 1000-row array of random unsigned ints, sorts each
// column on the GPU, and validates the result.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    int ny = 1000;                 // elements per column
    int nx = 500;                  // number of columns
    int num_items = nx * ny;
    bool verbose = false;

    // Create input data
    unsigned int *h_data = 0;
    unsigned int *d_data = 0;

    // Allocate CPU memory and initialize data.
    std::cout << "Initializing data:" << std::endl;
    h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int));
    if (h_data == NULL)
    {
        std::cerr << "Host allocation failed" << std::endl;
        exit(EXIT_FAILURE);
    }
    initialize_data(h_data, num_items);

    if (verbose)
    {
        for (int i = 0; i < num_items; i++)
            std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
    }

    // Allocate GPU memory and copy the input up. Check both calls -- CUDA
    // errors are sticky, and an unchecked failure here would surface later
    // with a confusing message.
    cudaError_t err = cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int));
    if (err != cudaSuccess)
    {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        free(h_data);
        exit(EXIT_FAILURE);
    }
    err = cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        std::cerr << "cudaMemcpy H2D failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        free(h_data);
        exit(EXIT_FAILURE);
    }

    // Execute
    std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
    run_qsort(d_data, num_items);

    // Check result
    std::cout << "Validating results: ";
    check_results(num_items, d_data);

    free(h_data);
    cudaFree(d_data);

    exit(EXIT_SUCCESS);
}
Boken
  • 4,825
  • 10
  • 32
  • 42
powermew
  • 121
  • 1
  • 8
  • 3
    why does using cufft before this mean that you cannot use thrust? – Robert Crovella Jul 16 '20 at 13:20
  • @Robert Crovella I am using the result of fft for sorting. And I tried to use cufft result array as input of thrust function, but I failed. Is it possible to use cudamalloc array as input of thrust? And also possible the opposite case? – powermew Jul 16 '20 at 15:45
  • 2
    It's possible to use a cufft result array as input to a thrust function. It's impossible to say what is wrong with code you haven't shown. The reason to use CUB or thrust is that they have reasonably good sort implementations, and the sort implementation you have using CDP is terrible, performance-wise. – Robert Crovella Jul 16 '20 at 16:09
  • 2
    two objectives I would consider are: 1. figure out how to use CUB or thrust for the sorting activity 2. If at all possible, rearrange (transpose) your data so that the sorting functions can be done row-wise, instead of column wise. With those changes, we can get the sort time down to ~10 milliseconds or less, depending on the GPU you are running on. Doing 500 row-wise sorts of 1024 elements each using CUB on a Tesla V100 can be done in less than 100 microseconds. – Robert Crovella Jul 16 '20 at 17:07
  • @Robert Crovella I'm sorry that I totally misunderstood with cuda between thrust. I have tiny knowledge in cuda yet. – powermew Jul 17 '20 at 04:43
  • @Robert Crovella Thank you for the guide. I will try it with your guide. I didn't knew that sorting function do row-wise. Is the CUB a Cublas? – powermew Jul 17 '20 at 04:47
  • 1
    cub is not cublas. It is a CUDA cooperative primitive library. See [here](https://github.com/NVlabs/cub). – Robert Crovella Jul 17 '20 at 05:50
  • 2
    I know precious little about GPU programming, but I just wanted to point out that, for finding a median, quickselect is faster than quicksort. -- In case you're not aware, quickselect is a simplified version of quicksort, which - after each partitioning step - only processes the part with the desired index (here, the median). – 500 - Internal Server Error Jul 17 '20 at 10:01
  • @RobertCrovella Oh that's amazing... Thank you so much. There is huge amount I should learn. – powermew Jul 20 '20 at 01:27
  • @500-InternalServerError Oh I didn't knew that either. Thank you so much. – powermew Jul 20 '20 at 01:27
  • @RobertCrovella I tried with thrust but it seems too slow. I posted my extra question on the below. – powermew Jul 21 '20 at 12:36

0 Answers0