I am trying to find median value for every column in 2D array with cuda.
size of 2D array = [1000,500]
So I found quicksort cuda code at CUDA SAMPLES.(C:\ProgramData\NVIDIA Corporation\CUDA Samples\v10.2\0_Simple\cdpSimpleQuicksort
)
It sorts 1-d vector and I can find just median value with indexing.
I changed SAMPLE code like below
- Execute kernel with 500 threads. Each kernel sorting vector(1000 elements)
cdp_simple_quicksort << < 1, 500 >> > (data, left, right, 0);
- Inside of kernel, indexing like
&data[1000 * idx_x]
for each thread can sort 1000 elements.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
Problems
Execution time is too long for my case comparison with the single vector case.
Single execution time for 1000 = about 700us.
But 1000*500(500 threads) = 280ms.
Questions
- Why it takes too long compared to simple case?
- Is there any better way to improve performance?(I am using cufft before this sorting, so I can not use thrust library)
// Selection sort used when depth gets too big or the number of elements drops
// below a threshold.
////////////////////////////////////////////////////////////////////////////////
__device__ void selection_sort(unsigned int *data, int left, int right)
{
for (int i = left; i <= right; ++i)
{
unsigned min_val = data[i];
int min_idx = i;
// Find the smallest value in the range [left, right].
for (int j = i + 1; j <= right; ++j)
{
unsigned val_j = data[j];
if (val_j < min_val)
{
min_idx = j;
min_val = val_j;
}
}
// Swap the values.
if (i != min_idx)
{
data[min_idx] = data[i];
data[i] = min_val;
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Very basic quicksort algorithm, recursively launching the next level.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
if (idx_x < 500) {
// If we're too deep or there are few elements left, we use an insertion sort...
if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)
{
selection_sort(&data[1000 * idx_x], left, right);
return;
}
unsigned int *lptr = &data[1000 * idx_x] + left;
unsigned int *rptr = &data[1000 * idx_x] + right;
unsigned int pivot = data[1000 * idx_x + (left + right) / 2];
// Do the partitioning.
while (lptr <= rptr)
{
// Find the next left- and right-hand values to swap
unsigned int lval = *lptr;
unsigned int rval = *rptr;
// Move the left pointer as long as the pointed element is smaller than the pivot.
while (lval < pivot)
{
lptr++;
lval = *lptr;
}
// Move the right pointer as long as the pointed element is larger than the pivot.
while (rval > pivot)
{
rptr--;
rval = *rptr;
}
// If the swap points are valid, do the swap!
if (lptr <= rptr)
{
*lptr++ = rval;
*rptr-- = lval;
}
}
// Now the recursive part
int nright = rptr - &data[1000 * idx_x];
int nleft = lptr - &data[1000 * idx_x];
// Launch a new block to sort the left part.
if (left < (rptr - &data[1000 * idx_x]))
{
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
cdp_simple_quicksort << < 1, 1, 0, s >> > (data[1000 * idx_x], left, nright, depth + 1);
cudaStreamDestroy(s);
}
// Launch a new block to sort the right part.
if ((lptr - &data[1000 * idx_x]) < right)
{
cudaStream_t s1;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cdp_simple_quicksort << < 1, 1, 0, s1 >> > (data[1000 * idx_x], nleft, right, depth + 1);
cudaStreamDestroy(s1);
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Call the quicksort kernel from the host.
////////////////////////////////////////////////////////////////////////////////
void run_qsort(unsigned int *data, unsigned int nitems)
{
// Prepare CDP for the max depth 'MAX_DEPTH'.
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH);
// Launch on device
int left = 0;
int test = 1000; // temporarily change for my case
int right = test - 1;
std::cout << "Launching kernel on the GPU" << std::endl;
cdp_simple_quicksort << < 1, 500 >> > (data, left, right, 0); // excute 500 thread for sorting each column
cudaDeviceSynchronize();
}
////////////////////////////////////////////////////////////////////////////////
// Initialize data on the host.
////////////////////////////////////////////////////////////////////////////////
void initialize_data(unsigned int *dst, unsigned int nitems)
{
// Fixed seed for illustration
srand(2047);
// Fill dst with random values
for (unsigned i = 0; i < nitems; i++)
dst[i] = rand() % nitems;
}
////////////////////////////////////////////////////////////////////////////////
// Verify the results.
////////////////////////////////////////////////////////////////////////////////
void check_results(int n, unsigned int *results_d)
{
unsigned int *results_h = new unsigned[n];
cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost);
for (int i = 1; i < n; ++i)
if (results_h[i - 1] > results_h[i])
{
std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] << " greater than " << results_h[i] << std::endl;
exit(EXIT_FAILURE);
}
std::cout << "OK" << std::endl;
delete[] results_h;
}
////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
int ny = 1000;
int nx = 500;
int num_items = nx * ny;
bool verbose = false;
// Find/set device and get device properties
int device = -1;
cudaDeviceProp deviceProp;
// Create input data
unsigned int *h_data = 0;
unsigned int *d_data = 0;
// Allocate CPU memory and initialize data.
std::cout << "Initializing data:" << std::endl;
h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int));
initialize_data(h_data, num_items);
if (verbose)
{
for (int i = 0; i < num_items; i++)
std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
}
// Allocate GPU memory.
cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int));
cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice);
// Execute
std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
run_qsort(d_data, num_items);
// Check result
std::cout << "Validating results: ";
check_results(num_items, d_data);
free(h_data);
cudaFree(d_data);
exit(EXIT_SUCCESS);
}