
Warp reduction based on shuffle instructions is expected to be faster than reduction through shared or global memory, as described in *Faster Parallel Reductions on Kepler* and *CUDA Pro Tip: Do The Kepler Shuffle*.

In the following code, I tried to validate this:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>


__inline__ __device__
float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);
    return val;
}

__inline__ __device__
float blockReduceSum(float val) {
    static __shared__ float shared[32];    // one partial sum per warp
    int lane = threadIdx.x%32;
    int wid = threadIdx.x / 32;
    val = warpReduceSum(val);

    //write reduced value to shared memory
    if (lane == 0) shared[wid] = val;
    __syncthreads();

    //ensure we only grab a value from shared memory if that warp existed
    val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);

    return val;
}

__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
    float sum = 0.0f;
    //printf("value = %d ", blockDim.x*gridDim.x);
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i<N; i += blockDim.x*gridDim.x) {
        sum += in[i];
    }
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0)
        out[blockIdx.x] = sum;
}

void device_reduce_stable(float *in, float* out, int N) {
    //int threads = 512;
    //int blocks = min((N + threads - 1) / threads, 1024);
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = N / maxThreadsPerBlock;
    device_reduce_stable_kernel<<<blocks, threads>>>(in, out, N);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));

    device_reduce_stable_kernel<<<1, 1024>>>(out, out, blocks);
    err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
}

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in,
    int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    // now we're down to one block left, so reduce it
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
}

int main()
{
    /*int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
            devProps.name, (int)devProps.totalGlobalMem,
            (int)devProps.major, (int)devProps.minor,
            (int)devProps.clockRate);
    }
*/
    const int ARRAY_SIZE = 2048;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // fill the input with consecutive values 0, 1, ..., ARRAY_SIZE-1
        h_in[i] = i;
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, *d_intermediate, *d_out;

    // allocate GPU memory
    cudaMalloc((void **)&d_in, ARRAY_BYTES);
    cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
    cudaMalloc((void **)&d_out, sizeof(float));

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    int whichKernel = 2;
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // launch the kernel
    cudaProfilerStart();
    switch (whichKernel) {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 2:
        printf("Running reduce with shuffle instruction\n");
        cudaEventRecord(start, 0);
        /*for (int i = 0; i < 100; i++)
        {*/
            device_reduce_stable(d_in, d_out, ARRAY_SIZE);
        //}
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaProfilerStop();
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    // elapsedTime /= 100.0f;   // only divide by 100 when the trial loops above are enabled

    // copy back the sum from GPU
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    printf("time elapsed: %f ms\n", elapsedTime);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);

    return 0;
}
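
As an aside, a quick correctness check (not part of the original code; `expected` is a name introduced here) would compare `h_out`, after the device-to-host copy, against the analytic sum 0 + 1 + ... + 2047 = 2048 * 2047 / 2 = 2,096,128:

    // --- Hypothetical check, not in the original: the input is 0..ARRAY_SIZE-1,
    //     so the reduction should return N*(N-1)/2 = 2,096,128 for N = 2048.
    float expected = 0.5f * ARRAY_SIZE * (ARRAY_SIZE - 1);
    if (h_out != expected)
        printf("mismatch: got %f, expected %f\n", h_out, expected);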

The results showed that the warp shuffle based reduction took nearly twice as long as the shared memory based reduction, which contradicts the expected behavior.
The experiment was performed on a Tesla K40c, whose compute capability (3.5) is above the 3.0 required for warp shuffle instructions.

  • When I run your code with a more sensible input size, I get the opposite result. The shuffle based reduction is about 50% faster than the shared memory reduction. – talonmies May 31 '17 at 08:54
  • I did the same experiment in the past. My results were that for larger input sizes both methods are identically fast and limited only by memory bandwidth. – dari May 31 '17 at 10:32
  • When I run your code as-is with `cuda-memcheck` it reports out-of-bounds access errors. There is little point in doing performance analysis with code that is broken. Any time you are going to ask for help with a CUDA code, it is good practice to **first** do [proper CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) and also to run your code with `cuda-memcheck`. – Robert Crovella May 31 '17 at 16:23
  • After fixing that issue in your code, I don't see any meaningful difference in the time reported for shuffle vs. shared mem. Make sure you are building a **Release** project, not a **Debug** project. If I build a **Debug** project, I see the 2:1 ratio of speed between shuffle:shared mem. – Robert Crovella May 31 '17 at 17:20
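
Following Robert Crovella's comments, below is a minimal sketch of the error-checking macro from the linked canonical answer, together with one plausible fix for the out-of-bounds write: the first launch of device_reduce_stable_kernel writes one partial sum per block, but d_out is allocated for a single float. The exact fix is an assumption based on the comment, not something it spells out:

    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
    {
        if (code != cudaSuccess) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    // --- Assumed fix: give d_out room for one partial sum per block of the first
    //     launch (blocks = ARRAY_SIZE / 1024 here), instead of a single float.
    gpuErrchk(cudaMalloc((void **)&d_out, (ARRAY_SIZE / 1024) * sizeof(float)));

    // --- After each kernel launch:
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());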

1 Answer

I'm comparing the following two reduction kernels, one using only shared memory WITHOUT using warp shuffling for the last warp reduction stage (version4) and one using shared memory AND warp shuffling for the last warp reduction stage (version5).

version4

template <class T>
__global__ void version4(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                                 // --- Local thread index
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;   // --- Global thread index - Fictitiously double the block dimension

    // --- Performs the first level of reduction in registers when reading from global memory. 
    T mySum = (i < N) ? g_idata[i] : 0;
    if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Single warp reduction by loop unrolling. Assumes blockDim.x >= 64.
    //     __syncwarp() is used instead of __syncthreads(), which must not be called
    //     from inside this divergent branch; the extra barriers separate the shared
    //     memory reads from the writes to avoid intra-warp races.
    if (tid < 32) {
        mySum += sdata[tid + 32]; __syncwarp(); sdata[tid] = mySum; __syncwarp();
        mySum += sdata[tid + 16]; __syncwarp(); sdata[tid] = mySum; __syncwarp();
        mySum += sdata[tid +  8]; __syncwarp(); sdata[tid] = mySum; __syncwarp();
        mySum += sdata[tid +  4]; __syncwarp(); sdata[tid] = mySum; __syncwarp();
        mySum += sdata[tid +  2]; __syncwarp(); sdata[tid] = mySum; __syncwarp();
        mySum += sdata[tid +  1];
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

version5

template <class T>
__global__ void version5(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                                 // --- Local thread index
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;   // --- Global thread index - Fictitiously double the block dimension

    // --- Performs the first level of reduction in registers when reading from global memory. 
    T mySum = (i < N) ? g_idata[i] : 0;
    if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Single warp reduction by shuffle operations
    if (tid < 32)
    {
        // --- Last iteration removed from the for loop, but needed for shuffle reduction
        mySum += sdata[tid + 32];
        // --- Reduce final warp using shuffle
        //for (int offset = warpSize / 2; offset > 0; offset /= 2) mySum += __shfl_down_sync(0xffffffff, mySum, offset);
        for (int offset = 1; offset < warpSize; offset *= 2) mySum += __shfl_xor_sync(0xffffffff, mySum, offset);
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

I confirm that there is no significant difference between the two. On my GTX920M card, the timings were the following:

N = 33554432
version4 = 27.5ms
version5 = 27.095ms

So, I'm confirming Robert's comment above.
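
For completeness, a host-side harness along the following lines could be used to launch and time the kernels. The answer does not show its driver code, so the block size, the grid sizing (each block consumes 2 * BLOCKSIZE elements), and the variable names here are assumptions:

    // --- Hypothetical driver, not part of the original answer. It times only the
    //     first reduction pass; the per-block partial sums in d_partial would still
    //     need a final reduction (on the host or with a second launch), and d_in is
    //     left uninitialized here since only timing is of interest.
    const int BLOCKSIZE = 256;
    const unsigned int N = 33554432;
    const int numBlocks = (N + 2 * BLOCKSIZE - 1) / (2 * BLOCKSIZE);

    float *d_in = NULL, *d_partial = NULL;
    cudaMalloc((void **)&d_in, N * sizeof(float));
    cudaMalloc((void **)&d_partial, numBlocks * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    version5<float><<<numBlocks, BLOCKSIZE, BLOCKSIZE * sizeof(float)>>>(d_in, d_partial, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("version5: %f ms\n", ms);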
