
A previous question asked how to efficiently find the maximum value of an array in CUDA: Finding max value in CUDA. The top response provided a link to an NVIDIA presentation on optimizing reduction kernels.

If you are using Visual Studio, simply remove the <sys/time.h> header reference and everything between the CPU EXECUTION comment markers.

I set up a variant that finds the max, but its result doesn't match what the CPU finds:

// Returns the maximum value of
// an array of size n
float GetMax(float *maxes, int n)
{
    int i = 0;
    float max = -100000;
    for(i = 0; i < n; i++)
    {
        if(maxes[i] > max)
            max = maxes[i];
    }

    return max;
}

// Too obvious...
__device__ float MaxOf2(float a, float b)
{
    if(a > b)   return a;
    else        return b;
}


__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(BLOCKSIZE*2) + tid;
    unsigned int gridSize = BLOCKSIZE*2*gridDim.x;

    sdata[tid] = 0;

    //MMX(index,i)
    //MMX(index,i+blockSize)
    // Final Optimized Kernel
    while (i < n) {
        sdata[tid] = MaxOf2(g_idata[i], g_idata[i+BLOCKSIZE]);
        i += gridSize;
    }
    __syncthreads();

    if (BLOCKSIZE >= 512) { if (tid < 256) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if (BLOCKSIZE >= 256) { if (tid < 128) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if (BLOCKSIZE >= 128) { if (tid < 64) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 64]); } __syncthreads(); }

    if (tid < 32) {
        if (BLOCKSIZE >= 64) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 32]);
        if (BLOCKSIZE >= 32) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
        if (BLOCKSIZE >= 16 ) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
        if (BLOCKSIZE >= 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
        if (BLOCKSIZE >= 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
        if (BLOCKSIZE >= 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 1]);
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

I have a giant setup to test this algorithm:

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include "book.h"

#define ARRAYSIZE 16384
#define GRIDSIZE 60
#define BLOCKSIZE 32
#define SIZEFLOAT 4

using namespace std;

// Function definitions
float GetMax(float *maxes, int n);
__device__ float MaxOf2(float a, float b);
__global__ void MaxReduction(int n, float *g_idata, float *g_odata);

// Returns random floating point number
float RandomReal(float low, float high)
{
    float d;

    d = (float) rand() / ((float) RAND_MAX + 1);
    return (low + d * (high - low));
}

int main()
{
    /*****************VARIABLE SETUP*****************/
    // Pointer to CPU numbers
    float *numbers;
    // Pointer to GPU numbers
    float *dev_numbers;
    // Counter
    int i = 0;

    // Randomize
    srand(time(0));

    // Timers
    // Kernel timers
    cudaEvent_t start_kernel, stop_kernel;
    float elapsedTime_kernel;
    HANDLE_ERROR(cudaEventCreate(&start_kernel));
    HANDLE_ERROR(cudaEventCreate(&stop_kernel));
    // cudaMalloc timers
    cudaEvent_t start_malloc, stop_malloc;
    float elapsedTime_malloc;
    HANDLE_ERROR(cudaEventCreate(&start_malloc));
    HANDLE_ERROR(cudaEventCreate(&stop_malloc));
    // CPU timers
    struct timeval start, stop;
    float elapsedTime = 0;
    /*****************VARIABLE SETUP*****************/


    /*****************CPU ARRAY SETUP*****************/
    // Setup CPU array
    HANDLE_ERROR(cudaHostAlloc((void**)&numbers, ARRAYSIZE * sizeof(float), cudaHostAllocDefault));
    for(i = 0; i < ARRAYSIZE; i++)
        numbers[i] = RandomReal(0, 50000.0);
    /*****************CPU ARRAY SETUP*****************/


    /*****************GPU ARRAY SETUP*****************/
    // Start recording cuda malloc time
    HANDLE_ERROR(cudaEventRecord(start_malloc,0));

    // Allocate memory to GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_numbers, ARRAYSIZE * sizeof(float)));
    // Transfer CPU array to GPU
    HANDLE_ERROR(cudaMemcpy(dev_numbers, numbers, ARRAYSIZE*sizeof(float), cudaMemcpyHostToDevice));
    // An array to temporarily store maximum values on the GPU
    float *dev_max;
    HANDLE_ERROR(cudaMalloc((void**)&dev_max, GRIDSIZE * sizeof(float)));
    // An array to hold the per-block maxes copied back from the GPU
    float maxes[GRIDSIZE];
    /*****************GPU ARRAY SETUP*****************/

    /*****************KERNEL EXECUTION*****************/
    // Start recording kernel execution time
    HANDLE_ERROR(cudaEventRecord(start_kernel,0));
    // Run kernel
    MaxReduction<<<GRIDSIZE, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>> (ARRAYSIZE, dev_numbers, dev_max);
    // Transfer maxes over
    HANDLE_ERROR(cudaMemcpy(maxes, dev_max, GRIDSIZE * sizeof(float), cudaMemcpyDeviceToHost));
    // Print out the max
    cout << GetMax(maxes, GRIDSIZE) << endl;

    // Stop recording kernel execution time
    HANDLE_ERROR(cudaEventRecord(stop_kernel,0));
    HANDLE_ERROR(cudaEventSynchronize(stop_kernel));
    // Retrieve recording data
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_kernel, start_kernel, stop_kernel));
    // Stop recording cuda malloc time
    HANDLE_ERROR(cudaEventRecord(stop_malloc,0));
    HANDLE_ERROR(cudaEventSynchronize(stop_malloc));
    // Retrieve recording data
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_malloc, start_malloc, stop_malloc));
    // Print results
    printf("%5.3f\t%5.3f\n", elapsedTime_kernel,  elapsedTime_malloc);
    /*****************KERNEL EXECUTION*****************/


    /*****************CPU EXECUTION*****************/
    // Capture the start time
    gettimeofday(&start, NULL);
    // Find the max on the CPU
    cout << GetMax(numbers, ARRAYSIZE) << endl;
    // Capture the stop time
    gettimeofday(&stop, NULL);
    // Retrieve time elapsed in milliseconds
    long int elapsed_sec = stop.tv_sec - start.tv_sec;
    long int elapsed_usec = stop.tv_usec - start.tv_usec;
    elapsedTime = (float)(1000.0f * elapsed_sec) + (float)(0.001f * elapsed_usec);
    // Print results
    printf("%5.3f\n", elapsedTime);
    /*****************CPU EXECUTION*****************/

    // Free memory
    cudaFreeHost(numbers);
    cudaFree(dev_numbers);
    cudaFree(dev_max);
    cudaEventDestroy(start_kernel);
    cudaEventDestroy(stop_kernel);
    cudaEventDestroy(start_malloc);
    cudaEventDestroy(stop_malloc);

    // Exit program
    return 0;
}

I ran cuda-memcheck on this test program (compiled with the -g and -G switches), and it reports 0 problems. Can anyone spot the issue?

NOTE: Be sure to have book.h from the CUDA by Example book in your current directory when you compile the program. Source link here: http://developer.nvidia.com/cuda-example-introduction-general-purpose-gpu-programming (download the source code, and book.h will be under the common directory).


1 Answer


Your kernel looks broken to me. The thread-local search (before the shared memory reduction) should look something like this:

sdata[tid] = g_idata[i];
i += gridSize;

while (i < n) {
    sdata[tid] = MaxOf2(sdata[tid], g_idata[i]);
    i += gridSize;
}

shouldn't it?

Also note that if you run this on Fermi, the shared memory buffer should be declared volatile, and you will get a noticeable improvement in performance if the thread-local search is done with a register variable rather than in shared memory. There is about an 8 times difference in effective bandwidth between the two.


EDIT: Here is a simplified, working version of your reduction kernel. You should note a number of differences compared to your original:

__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
    extern __shared__ volatile float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(BLOCKSIZE) + tid;
    unsigned int gridSize = BLOCKSIZE*gridDim.x;

    float val = g_idata[i];
    i += gridSize;
    while (i < n) {
        val = MaxOf2(g_idata[i],val);
        i += gridSize;
    }
    sdata[tid] = val;
    __syncthreads();

    // This version uses a single warp for the shared memory
    // reduction
# pragma unroll
    for(int i=(tid+32); ((tid<32)&&(i<BLOCKSIZE)); i+=32)
        sdata[tid] = MaxOf2(sdata[tid], sdata[i]);

    if (tid < 16) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
    if (tid < 8)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
    if (tid < 4)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
    if (tid < 2)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
    if (tid == 0) g_odata[blockIdx.x] = MaxOf2(sdata[tid], sdata[tid + 1]);
}

This code should also be safe on Fermi. You should also familiarise yourself with the CUDA math library, because there is an fmax(x,y) intrinsic which you should use in place of your MaxOf2 function.
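
For instance, a minimal sketch of the thread-local search using the single-precision fmaxf intrinsic in place of MaxOf2 (only the loop from the kernel above changes, everything else stays the same):

float val = g_idata[i];
i += gridSize;
while (i < n) {
    val = fmaxf(val, g_idata[i]);   // CUDA math library intrinsic instead of MaxOf2
    i += gridSize;
}
sdata[tid] = val;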

  • I think both of our codes ultimately access the same indices of the array, although yours definitely makes much more sense. I compiled the code, but I don't get the right result. Any chance it's how I set up my code? – sj755 Sep 25 '11 at 21:22
  • @seljuq70: No, it is the kernel code which is wrong, in two different ways. You have two different sources of out-of-bounds memory access in the code, one in global memory, one in shared memory. I have updated my answer with a working implementation for you to study. – talonmies Sep 26 '11 at 06:38
  • It works; however, there are issues with your code. It assumes that the size of the array is larger than the number of threads; otherwise it results in memory access violations in global memory, and values end up being compared against junk. Your code works for the case of 16384 though, so thanks!!! – sj755 Sep 27 '11 at 00:24
  • You got what you paid for. And the problem isn't in my code, it is in *your* host code. Any sensible implementation would have host-side code which checks the size of the input data set and sizes the grid accordingly, and if the input data set is very small, just performs the reduction on the host (see the sketch after these comments). – talonmies Sep 27 '11 at 02:27
  • You're right; the question is, in which cases do we use the GPU or the CPU? Anyway, doesn't matter, thanks again. – sj755 Sep 27 '11 at 06:21
  • @seljuq70: you might find [this SO question](http://stackoverflow.com/questions/5810447/cuda-block-and-grid-size-efficiencies/5810780#5810780) worth looking at. – talonmies Sep 27 '11 at 06:42
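
For reference, a minimal host-side sketch of the grid sizing talonmies describes in the comments, reusing the names from the question (the CPU fallback branch and the reuse of GRIDSIZE as the cap are illustrative assumptions, not part of either poster's code):

// Size the grid so the total thread count never exceeds the array size,
// since the kernel reads g_idata[i] unconditionally on its first pass.
int blocks = ARRAYSIZE / BLOCKSIZE;         // floor, so blocks*BLOCKSIZE <= ARRAYSIZE
if (blocks > GRIDSIZE) blocks = GRIDSIZE;   // keep the 60-block cap from the question
if (blocks == 0) {
    // Array smaller than one block: just do the reduction on the host
    cout << GetMax(numbers, ARRAYSIZE) << endl;
} else {
    MaxReduction<<<blocks, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>>(ARRAYSIZE, dev_numbers, dev_max);
    HANDLE_ERROR(cudaMemcpy(maxes, dev_max, blocks * sizeof(float), cudaMemcpyDeviceToHost));
    cout << GetMax(maxes, blocks) << endl;
}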