I am trying to implement a minimum reduction algorithm, taken from this answer. I cannot use Thrust or other libraries for this project, so I have to stick to pure CUDA. The goal is to extend the code to very large arrays, where in my experience (and on my machines) Thrust is too slow for my purposes.
Below you will find the code, which generates a 4096-element double array where each element equals its index (i.e. [0, 1, 2, 3, ..., 4095]) and in which a value of -1 is artificially inserted at a given index (4091 in this case). However, the code does not seem to work correctly. I am compiling it for my GPU (compute capability 5.0) on a Windows machine with CUDA 11 and Visual Studio 2017, using nvcc -w -arch=sm_50 input.cu -o output.exe
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
//#include <unistd.h>
//#include <sys/time.h>
#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif
#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif
#ifndef MIN_IDX
#define MIN_IDX(x,y, idx_x, idx_y) ((x < y) ? idx_x : idx_y)
#endif
#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y) __mul24(x,y)
#else
#define int_mult(x,y) x*y
#endif
#define inf 0x7f800000
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory {
__device__ inline operator T *() {
extern __shared__ int __smem[];
return (T *) __smem;
}
__device__ inline operator const T *() const {
extern __shared__ int __smem[];
return (T *) __smem;
}
};
// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double> {
__device__ inline operator double *() {
extern __shared__ double __smem_d[];
return (double *) __smem_d;
}
__device__ inline operator const double *() const {
extern __shared__ double __smem_d[];
return (double *) __smem_d;
}
};
template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata,int *g_oIdxs, unsigned int n) {
T *sdata = SharedMemory<T>();
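// the index array is placed blockSize ints past the start of the same dynamic shared-memory buffer that holds the values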
int *sdataIdx = ((int *)sdata) + blockSize;
//int *sdataIdx = SharedMemory<int>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
T myMin = 99999;
int myMinIdx = -1;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
myMinIdx = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx);
myMin = MIN(g_idata[i], myMin);
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n){
//myMin += g_idata[i + blockSize];
myMinIdx = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx);
myMin = MIN(g_idata[i + blockSize], myMin);
}
i += gridSize;
}
// each thread puts its local minimum (and its index) into shared memory
sdata[tid] = myMin;
sdataIdx[tid] = myMinIdx;
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 2048) && (tid < 1024)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1024], myMin, sdataIdx[tid + 1024], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 1024], myMin);
}
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 1024) && (tid < 512)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 512], myMin, sdataIdx[tid + 512], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 512], myMin);
}
__syncthreads();
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
//sdata[tid] = mySum = mySum + sdata[tid + 256];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 256], myMin);
}
__syncthreads();
if ((blockSize >= 256) && (tid < 128)) {
//sdata[tid] = myMin = myMin + sdata[tid + 128];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 128], myMin);
}
__syncthreads();
if ((blockSize >= 128) && (tid < 64)) {
//sdata[tid] = myMin = myMin + sdata[tid + 64];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 64], myMin);
}
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
if (tid < 32) {
// Fetch the final intermediate minimum from the 2nd warp
if (blockSize >= 64){
//myMin += sdata[tid + 32];
myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
myMin = MIN(sdata[tid + 32], myMin);
}
// Reduce final warp using shuffle
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//myMin += __shfl_down(myMin, offset);
int tempMyMinIdx = __shfl_down(myMinIdx, offset);
double tempMyMin = __shfl_down(myMin, offset);
myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx);
myMin = MIN(tempMyMin, myMin);
}
}
#else
// fully unroll reduction within a single warp
if ((blockSize >= 64) && (tid < 32))
{
//sdata[tid] = myMin = myMin + sdata[tid + 32];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 32], myMin);
}
__syncthreads();
if ((blockSize >= 32) && (tid < 16))
{
//sdata[tid] = myMin = myMin + sdata[tid + 16];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 16], myMin);
}
__syncthreads();
if ((blockSize >= 16) && (tid < 8))
{
//sdata[tid] = myMin = myMin + sdata[tid + 8];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 8], myMin);
}
__syncthreads();
if ((blockSize >= 8) && (tid < 4))
{
//sdata[tid] = myMin = myMin + sdata[tid + 4];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 4], myMin);
}
__syncthreads();
if ((blockSize >= 4) && (tid < 2))
{
//sdata[tid] = myMin = myMin + sdata[tid + 2];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 2], myMin);
}
__syncthreads();
if ((blockSize >= 2) && ( tid < 1))
{
//sdata[tid] = myMin = myMin + sdata[tid + 1];
sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx);
sdata[tid] = myMin = MIN(sdata[tid + 1], myMin);
}
__syncthreads();
#endif
__syncthreads();
// write result for this block to global mem
if (tid == 0){
g_odata[blockIdx.x] = myMin;
g_oIdxs[blockIdx.x] = myMinIdx;
}
}
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks,
int maxThreads, int &blocks, int &threads) {
//get device capability, to avoid block/grid size exceed the upper bound
cudaDeviceProp prop;
int device;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
if (whichKernel < 3) {
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
} else {
threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
}
if ((float) threads * blocks
> (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
printf("n is too large, please choose a smaller number!\n");
}
if (blocks > prop.maxGridSize[0]) {
printf(
"Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
blocks, prop.maxGridSize[0], threads * 2, threads);
blocks /= 2;
threads *= 2;
}
if (whichKernel == 6) {
blocks = MIN(maxBlocks, blocks);
}
}
template<class T>
void reduceMin(int size, int threads, int blocks, int whichKernel, T *d_idata,
T *d_odata, int *idxs, int *oIdxs) {
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); // shared memory for one value of type T per thread
smemSize += threads*sizeof(int); // shared memory for the indexes
if (isPow2(size)) {
switch (threads) {
case 2048:
reduceMin6<T, 2048, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 1024:
reduceMin6<T, 1024, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 512:
reduceMin6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 256:
reduceMin6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 128:
reduceMin6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 64:
reduceMin6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 32:
reduceMin6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 16:
reduceMin6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 8:
reduceMin6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 4:
reduceMin6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 2:
reduceMin6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 1:
reduceMin6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
}
} else {
switch (threads) {
case 2048:
reduceMin6<T, 2048, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break ;
case 1024:
reduceMin6<T, 1024, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break ;
case 512:
reduceMin6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 256:
reduceMin6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 128:
reduceMin6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 64:
reduceMin6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 32:
reduceMin6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 16:
reduceMin6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 8:
reduceMin6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 4:
reduceMin6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 2:
reduceMin6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
case 1:
reduceMin6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
d_odata, oIdxs, size);
break;
}
}
}
template<class T>
void reduceMINCPU(T *data, int size, T *min, int *idx)
{
*min = data[0];
int min_idx = 0;
T c = (T)0.0;
for (int i = 1; i < size; i++)
{
T y = data[i];
T t = MIN(*min, y);
min_idx = MIN_IDX(*min, y, min_idx, i);
(*min) = t;
}
*idx = min_idx;
return;
}
// Instantiate the reduction function for 3 types
template void
reduceMin<int>(int size, int threads, int blocks, int whichKernel, int *d_idata,
int *d_odata, int *idxs, int *oIdxs);
template void
reduceMin<float>(int size, int threads, int blocks, int whichKernel, float *d_idata,
float *d_odata, int *idxs, int *oIdxs);
template void
reduceMin<double>(int size, int threads, int blocks, int whichKernel, double *d_idata,
double *d_odata, int *idxs, int *oIdxs);
int main(){
int num_els=8*8*8*8;
int maxThreads = 128; // number of threads per block
int whichKernel = 6;
int maxBlocks = 1000000;
double* d_in = NULL;
double* d_out = NULL;
int *d_idxs = NULL;
int *d_oIdxs = NULL;
printf("%d elements\n", num_els);
printf("%d threads (max)\n", maxThreads);
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
numThreads);
printf("%d numBlocks \n", numBlocks);
printf("%d numThreads \n", numThreads);
cudaMalloc((void **) &d_in, num_els * sizeof(double));
cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
cudaMalloc((void **) &d_out, numBlocks * sizeof(double));
cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));
printf("Finished cudaMalloc");
double* in = (double*) malloc(num_els * sizeof(double));
int *idxs = (int*) malloc(num_els * sizeof(int));
double* out = (double*) malloc(numBlocks * sizeof(double));
int* oIdxs = (int*) malloc(numBlocks * sizeof(int));
for (int i = 0; i < num_els; i++) {
in[i] = (double)i*1.0;
idxs[i] = i;
}
in[num_els-5]=(double)-1.0;
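// num_els - 5 == 4091, so the expected minimum is -1.0 at index 4091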
printf("Verify that the in array is correctly filled \n");
for (int i = num_els-10; i < num_els; i++) {
//
printf("\n [%d] = %.4f", idxs[i], in[i]);
}
// copy data directly to device memory
cudaMemcpy(d_in, in, num_els * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, numBlocks * sizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);
printf(" \n Finished cudaMemcopy \n");
reduceMin<double>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);
cudaMemcpy(out, d_out, numBlocks * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);
for(int i=0; i< numBlocks; i++){
//for(int i=numBlocks-20; i< numBlocks; i++)
printf("\n Reduce MIN GPU idx: %d value: %f", oIdxs[i], out[i]);
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_idxs);
free(in);
free(out);
return 0;
}
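For reference, the expected result is a minimum value of -1.0 at index 4091. The listing already contains a reduceMINCPU helper that main never calls; a quick host-side check along these lines (just a sketch, appended at the end of main and reusing the in and num_els variables defined there) prints the answer the GPU reduction should also find:
double expectedMin;
int expectedIdx;
// host-side reference: scan the same array and report the minimum and its index
reduceMINCPU<double>(in, num_els, &expectedMin, &expectedIdx);
printf("\n Reduce MIN CPU idx: %d value: %f\n", expectedIdx, expectedMin);
// expected output: Reduce MIN CPU idx: 4091 value: -1.000000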
However, if I convert the code to find the minimum and its index in a float array, it works correctly. Here is the main for the float version (which is the only part that differs: some variables and the arrays are declared as "float" instead of "double"). It is compiled in the same way as the former.
int main(){
int num_els=8*8*8*8;
int maxThreads = 128; // number of threads per block
int whichKernel = 6;
int maxBlocks = 1000000;
float* d_in = NULL;
float* d_out = NULL;
int *d_idxs = NULL;
int *d_oIdxs = NULL;
printf("%d elements\n", num_els);
printf("%d threads (max)\n", maxThreads);
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
numThreads);
printf("%d numBlocks \n", numBlocks);
printf("%d numThreads \n", numThreads);
cudaMalloc((void **) &d_in, num_els * sizeof(float));
cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
cudaMalloc((void **) &d_out, numBlocks * sizeof(float));
cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));
printf("Finished cudaMalloc");
float* in = (float*) malloc(num_els * sizeof(float));
int *idxs = (int*) malloc(num_els * sizeof(int));
float* out = (float*) malloc(numBlocks * sizeof(float));
int* oIdxs = (int*) malloc(numBlocks * sizeof(int));
for (int i = 0; i < num_els; i++) {
in[i] = (float)i*1.0;
idxs[i] = i;
}
in[num_els-5]=(float)-1.0;
printf("Verify that the in array is correctly filled \n");
for (int i = num_els-10; i < num_els; i++) {
//
printf("\n [%d] = %.4f", idxs[i], in[i]);
}
// copy data directly to device memory
cudaMemcpy(d_in, in, num_els * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, numBlocks * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);
printf(" \n Finished cudaMemcopy \n");
reduceMin<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);
cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);
for(int i=0; i< numBlocks; i++){
//for(int i=numBlocks-20; i< numBlocks; i++)
printf("\n Reduce MIN GPU idx: %d value: %f", oIdxs[i], out[i]);
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_idxs);
free(in);
free(out);
return 0;
}