I optimized my kernel following this document: http://www.cuvilib.com/Reduction.pdf
The same document (with smaller file size) can also be found here: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
It looks like this for now:
template<unsigned int blockSize>
__global__ void minMaxReduction_k(float *g_minData, float *g_maxData, float *g_minOutput, float *g_maxOutput, unsigned int n)
{
extern __shared__ float shared[];
float* minSdata = (float*)shared;
float* maxSdata = (float*)&minSdata[4*blockDim.x];
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
minSdata[4*tid] = FLT_MAX;
minSdata[4*tid+1] = FLT_MAX;
minSdata[4*tid+2] = FLT_MAX;
maxSdata[4*tid] = -FLT_MAX;
maxSdata[4*tid+1] = -FLT_MAX;
maxSdata[4*tid+2] = -FLT_MAX;
while(i<n){
minSdata[4*tid] = fminf(fminf(minSdata[4*tid], g_minData[4*i]), g_minData[4*(i+blockDim.x)]);
minSdata[4*tid+1] = fminf(fminf(minSdata[4*tid+1], g_minData[4*i+1]), g_minData[4*(i+blockDim.x)+1]);
minSdata[4*tid+2] = fminf(fminf(minSdata[4*tid+2], g_minData[4*i+2]), g_minData[4*(i+blockDim.x)+2]);
maxSdata[4*tid] = fmaxf(fmaxf(maxSdata[4*tid], g_maxData[4*i]), g_maxData[4*(i+blockDim.x)]);
maxSdata[4*tid+1] = fmaxf(fmaxf(maxSdata[4*tid+1], g_maxData[4*i+1]), g_maxData[4*(i+blockDim.x)+1]);
maxSdata[4*tid+2] = fmaxf(fmaxf(maxSdata[4*tid+2], g_maxData[4*i+2]), g_maxData[4*(i+blockDim.x)+2]);
i+=gridSize;
}
__syncthreads();
if(blockSize >= 1024){
if(tid < 512){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+512)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+512)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+512)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+512)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+512)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+512)+2]);
}
__syncthreads();
}
if(blockSize >= 512){
if(tid < 256){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+256)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+256)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+256)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+256)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+256)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+256)+2]);
}
__syncthreads();
}
if(blockSize >= 256){
if(tid < 128){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+128)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+128)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+128)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+128)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+128)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+128)+2]);
}
__syncthreads();
}
if(blockSize >= 128){
if(tid < 64){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+64)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+64)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+64)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+64)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+64)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+64)+2]);
}
__syncthreads();
}
if(tid<32){
if (blockSize >= 64){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+32)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+32)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+32)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+32)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+32)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+32)+2]);
}
//
if (blockSize >= 32){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+16)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+16)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+16)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+16)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+16)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+16)+2]);
}
//
if (blockSize >= 16){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+8)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+8)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+8)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+8)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+8)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+8)+2]);
}
//
if (blockSize >= 8){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+4)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+4)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+4)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+4)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+4)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+4)+2]);
}
//
if (blockSize >= 4){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+2)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+2)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+2)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+2)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+2)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+2)+2]);
}
//
if (blockSize >= 2){
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+1)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+1)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+1)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+1)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+1)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+1)+2]);
}
}
// write result for this block to global mem
if (tid == 0){
g_minOutput[blockIdx.x] = minSdata[0];
g_minOutput[blockIdx.x+1] = minSdata[1];
g_minOutput[blockIdx.x+2] = minSdata[2];
g_maxOutput[blockIdx.x] = maxSdata[0];
g_maxOutput[blockIdx.x+1] = maxSdata[1];
g_maxOutput[blockIdx.x+2] = maxSdata[2];
}
}
Invoked like this:
float *d_minOutput;
float *d_maxOutput;
int tempN = n;
while(tempN>1){
getNumBlocksAndThreads(tempN, 65535, 1024, blocks, threads);
cudaMalloc((void **)&d_minOutput, 4*(sizeof(float)*blocks));
cudaMalloc((void **)&d_maxOutput, 4*(sizeof(float)*blocks));
int smem = (threads <= 32) ? 2*2*4*threads*sizeof(float) : 2*4*threads*sizeof(float);
switch(threads){
case 1024:
minMaxReduction_k<1024><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 512:
minMaxReduction_k<512><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 256:
minMaxReduction_k<256><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 128:
minMaxReduction_k<128><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 64:
minMaxReduction_k<64><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 32:
minMaxReduction_k<32><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 16:
minMaxReduction_k<16><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 8:
minMaxReduction_k<8><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 4:
minMaxReduction_k<4><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 2:
minMaxReduction_k<2><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 1:
minMaxReduction_k<1><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
}
tempN = blocks;
cudaMemcpy(d_minData, d_minOutput, 4*(sizeof(float)*blocks), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_maxData, d_maxOutput, 4*(sizeof(float)*blocks), cudaMemcpyDeviceToDevice);
cudaFree(d_minOutput);
cudaFree(d_maxOutput);
}
Helpers:
void UniformGrid::getNumBlocksAndThreads(unsigned int n, unsigned int maxBlocks, unsigned int maxThreads, unsigned int &blocks, unsigned int &threads)
{
//get device capability, to avoid block/grid size excceed the upbound
cudaDeviceProp prop;
int device;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
if ((float)threads*blocks > (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock)
{
printf("n is too large, please choose a smaller number!\n");
}
if (blocks > (unsigned int) prop.maxGridSize[0])
{
printf("Grid size <%d> excceeds the device capability <%d>, set block size as %d (original %d)\n",
blocks, prop.maxGridSize[0], threads*2, threads);
blocks /= 2;
threads *= 2;
}
}
unsigned int UniformGrid::nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
In a few days I'll probably test which solution is faster.