Warp reduction based on shuffle instructions is expected to be faster than reduction through shared or global memory, as described in "Faster Parallel Reductions on Kepler" and "CUDA Pro Tip: Do The Kepler Shuffle".
I tried to validate this with the following code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__inline__ __device__
float warpReduceSum(float val) {
    // reduce within a warp via shuffle-down; after the loop, lane 0 holds the warp's sum
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);
    return val;
}
__inline__ __device__
float blockReduceSum(float val) {
    static __shared__ float shared[32]; // one partial sum per warp (at most 32 warps per block)
    int lane = threadIdx.x % 32;
    int wid = threadIdx.x / 32;
    val = warpReduceSum(val);
    // write reduced value to shared memory
    if (lane == 0) shared[wid] = val;
    __syncthreads();
    // ensure we only grab a value from shared memory if that warp existed
    val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);
    return val;
}
__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
    float sum = 0.0f;
    // grid-stride loop: each thread accumulates multiple elements
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += blockDim.x*gridDim.x) {
        sum += in[i];
    }
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0)
        out[blockIdx.x] = sum;
}
void device_reduce_stable(float *in, float* out, int N) {
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = N / maxThreadsPerBlock;
    device_reduce_stable_kernel<<<blocks, threads>>>(in, out, N);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
    // second pass: reduce the per-block partial sums with a single block
    device_reduce_stable_kernel<<<1, 1024>>>(out, out, blocks);
    err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
}
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;
    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads(); // make sure all adds at one stage are done!
    }
    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;
    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads(); // make sure entire block is loaded!
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads(); // make sure all adds at one stage are done!
    }
    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}
void reduce(float * d_out, float * d_intermediate, float * d_in,
            int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    // now we're down to one block left, so reduce it
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
}
int main()
{
/*int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
int dev = 0;
cudaSetDevice(dev);
cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, dev) == 0)
{
printf("Using device %d:\n", dev);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
devProps.name, (int)devProps.totalGlobalMem,
(int)devProps.major, (int)devProps.minor,
(int)devProps.clockRate);
}
*/
    const int ARRAY_SIZE = 2048;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // fill with 0..ARRAY_SIZE-1 and keep a host-side reference sum
        h_in[i] = i;
        sum += h_in[i];
    }
    // declare GPU memory pointers
    float * d_in, *d_intermediate, *d_out;
    // allocate GPU memory
    cudaMalloc((void **)&d_in, ARRAY_BYTES);
    cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
    cudaMalloc((void **)&d_out, sizeof(float));
    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
    int whichKernel = 2;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // launch the kernel
    cudaProfilerStart();
    switch (whichKernel) {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 2:
        printf("Running reduce with shuffle instruction\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            device_reduce_stable(d_in, d_out, ARRAY_SIZE);
        //}
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaProfilerStop();
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    elapsedTime /= 100.0f; // 100 trials (only meaningful if the 100-iteration loops above are enabled)
    // copy back the sum from GPU and compare it against the host-side sum
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("device sum = %f, host sum = %f\n", h_out, sum);
    printf("average time elapsed: %f\n", elapsedTime);
    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);
    return 0;
}
The results showed that the shuffle-based reduction took nearly twice as long as the shared-memory reduction, which contradicts the expected behavior.
The experiment was performed on a Tesla K40c, whose compute capability (3.5) is higher than 3.0.
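
For reference, the time reported above comes from a single launch between the two cudaEventRecord calls (the 100-iteration loops are commented out in the code). Below is a minimal sketch of how I would average over repeated launches instead, reusing d_in, d_out, ARRAY_SIZE and the start/stop events declared in main() above; the warm-up launch and NUM_TRIALS = 100 are my own choices for illustration, not part of the run that produced the numbers above. Note this only works as-is for the shuffle path: global_reduce_kernel reduces d_in in place, so repeating it would require re-copying the input each iteration.

// Sketch only: average the shuffle-based reduction over repeated launches.
// Assumes d_in, d_out, ARRAY_SIZE, start and stop from main() above are in scope.
const int NUM_TRIALS = 100;

device_reduce_stable(d_in, d_out, ARRAY_SIZE); // warm-up launch, excluded from timing
cudaDeviceSynchronize();

cudaEventRecord(start, 0);
for (int i = 0; i < NUM_TRIALS; i++) {
    device_reduce_stable(d_in, d_out, ARRAY_SIZE);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float msTotal = 0.0f;
cudaEventElapsedTime(&msTotal, start, stop); // elapsed time in milliseconds
printf("average time per launch: %f ms\n", msTotal / NUM_TRIALS);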