I have a matrix of size 50000x100 and I need to sort each row using Cuda in C++. My architecture is a K80 NVidia card.
Since the number of columns is small, I am currently running the sorting algorithm inside a kernel. I am using a modified bubble algorithm that runs on all lines of the matrix.
I am wondering if there is an more efficient way to proceed. I tried to use thrust::sort inside my kernel but it is much slower. I also tried a merge sort algorithm but the recursive part of the algorithm didn't work inside my kernel.
==edit==
here is my kernel:
__global__ void computeQuantilesKernel(float *matIn, int nRows, int nCols, int nQuantiles, float *outsideValues, float *quantilesAve, int param2)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float values[100];//big enough for 100 columns
int keys[100];
int nQuant[100];//big enough for 100 quantiles (percentiles)
float thisQuantile[100];
int quant;
if (idx >= nRows) return;
//read matIn from global memory
for (int i = 0; i < nCols; i++)
{
values[i] = matIn[idx * nCols + i + param2 * nCols * nRows];
keys[i] = i;
}
//bubble Sort:
int i, j;
int temp;
float tempVal;
for (i = 0; i < nCols - 1; i++)
{
for (j = 0; j < nCols - i - 1; j++)
{
if (values[j + 1] < values[j]) // ascending order simply changes to <
{
tempVal = values[j]; // swap elements
temp = keys[j]; // swap elements
values[j] = values[j + 1];
keys[j] = keys[j + 1];
values[j + 1] = tempVal;
keys[j + 1] = temp;
}
}
}
//end of bubble sort
//reset nQuant and thisQuantile
for (int iQuant = 0; iQuant < nQuantiles; iQuant++)
{
nQuant[iQuant] = 0;
thisQuantile[iQuant] = 0;
}
//Compute sum of outsideValues for each quantile
for (int i = 0; i < nCols; i++)
{
quant = (int)(((float)i + 0.5) / ((float)nCols / (float)nQuantiles));//quantile like Matlab
nQuant[quant]++;
thisQuantile[quant] += outsideValues[idx * nCols + keys[i]];
}
//Divide by the size of each quantile to get averages
for (int iQuant = 0; iQuant < nQuantiles; iQuant++)
{
quantilesAve[idx + nRows * iQuant + param2 * nQuantiles * nRows] = thisQuantile[iQuant] / (float)nQuant[iQuant];
}
}