I'm trying to perform an inclusive scan to find the cumulative sum of an array. Following the advice given by harrism here, I'm using the procedure given here, but following the advice of those authors, I'm trying to write code that has each thread calculate 4 elements instead of one to mask memory latency.
I am staying away from thrust as performance is essential, and I need multi-stream capability. I have only just discovered CUB, and that will be my next effort, but I would like a multi-block solution and would also like to know where I've gone wrong on my existing code, just as an exercise to better understand CUDA.
The code below allocates 4 data elements to each block, where each block must have a multiple of 32 threads. My data will have a multiple of 128 threads so this restriction is acceptable to me. Enough shared memory is allocated to each block for the 4*blockDim.x
elements plus an additional 32 elements to sum between warps. scanBlockAnyLength
then adds the necessary offset to correct mismatch between warps, saving the final value of each warp to dev_blockSum
in device global memory. sumWarp4_32
then scans this array to find the final to correct the mismatch between blocks, which is then added on in kernel_sumBlock
#include<cuda.h>
#include<iostream>
using std::cout;
using std::endl;
#define MAX_THREADS 1024
#define MAX_BLOCKS 65536
#define N 512
__device__ float sumWarp4_128(float* ptr, const int tidx = threadIdx.x) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 threads per warp
unsigned int i = warpid*128+lane; //first element of block data set this thread looks at
if( lane >= 1 ) ptr[i] += ptr[i-1];
if( lane >= 2 ) ptr[i] += ptr[i-2];
if( lane >= 4 ) ptr[i] += ptr[i-4];
if( lane >= 8 ) ptr[i] += ptr[i-8];
if( lane >= 16 ) ptr[i] += ptr[i-16];
if( lane==0 ) ptr[i+32] += ptr[i+31];
if( lane >= 1 ) ptr[i+32] += ptr[i+32-1];
if( lane >= 2 ) ptr[i+32] += ptr[i+32-2];
if( lane >= 4 ) ptr[i+32] += ptr[i+32-4];
if( lane >= 8 ) ptr[i+32] += ptr[i+32-8];
if( lane >= 16 ) ptr[i+32] += ptr[i+32-16];
if( lane==0 ) ptr[i+64] += ptr[i+63];
if( lane >= 1 ) ptr[i+64] += ptr[i+64-1];
if( lane >= 2 ) ptr[i+64] += ptr[i+64-2];
if( lane >= 4 ) ptr[i+64] += ptr[i+64-4];
if( lane >= 8 ) ptr[i+64] += ptr[i+64-8];
if( lane >= 16 ) ptr[i+64] += ptr[i+64-16];
if( lane==0 ) ptr[i+96] += ptr[i+95];
if( lane >= 1 ) ptr[i+96] += ptr[i+96-1];
if( lane >= 2 ) ptr[i+96] += ptr[i+96-2];
if( lane >= 4 ) ptr[i+96] += ptr[i+96-4];
if( lane >= 8 ) ptr[i+96] += ptr[i+96-8];
if( lane >= 16 ) ptr[i+96] += ptr[i+96-16];
return ptr[i+96];
}
__host__ __device__ float sumWarp4_32(float* ptr, const int tidx = threadIdx.x) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 elements per warp
unsigned int i = warpid*32+lane; //first element of block data set this thread looks at
if( lane >= 1 ) ptr[i] += ptr[i-1];
if( lane >= 2 ) ptr[i] += ptr[i-2];
if( lane >= 4 ) ptr[i] += ptr[i-4];
if( lane >= 8 ) ptr[i] += ptr[i-8];
if( lane >= 16 ) ptr[i] += ptr[i-16];
return ptr[i];
}
__device__ float sumBlock4(float* ptr, const int tidx = threadIdx.x, const int bdimx = blockDim.x ) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 threads per warp
float val = sumWarp4_128(ptr);
__syncthreads();//should be included
if( tidx==bdimx-1 ) ptr[4*bdimx+warpid] = val;
__syncthreads();
if( warpid==0 ) sumWarp4_32((float*)&ptr[4*bdimx]);
__syncthreads();
if( warpid>0 ) {
ptr[warpid*128+lane] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+32] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+64] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+96] += ptr[4*bdimx+warpid-1];
}
__syncthreads();
return ptr[warpid*128+lane+96];
}
__device__ void scanBlockAnyLength4(float *ptr, float* dev_blockSum, const float* dev_input, float* dev_output, const int idx = threadIdx.x, const int bdimx = blockDim.x, const int bidx = blockIdx.x) {
const unsigned int lane = idx & 31;
const unsigned int warpid = idx >> 5;
ptr[lane+warpid*128] = dev_input[lane+warpid*128+bdimx*bidx*4];
ptr[lane+warpid*128+32] = dev_input[lane+warpid*128+bdimx*bidx*4+32];
ptr[lane+warpid*128+64] = dev_input[lane+warpid*128+bdimx*bidx*4+64];
ptr[lane+warpid*128+96] = dev_input[lane+warpid*128+bdimx*bidx*4+96];
__syncthreads();
float val = sumBlock4(ptr);
__syncthreads();
dev_blockSum[0] = 0.0f;
if( idx==0 ) dev_blockSum[bidx+1] = ptr[bdimx*4-1];
dev_output[lane+warpid*128+bdimx*bidx*4] = ptr[lane+warpid*128];
dev_output[lane+warpid*128+bdimx*bidx*4+32] = ptr[lane+warpid*128+32];
dev_output[lane+warpid*128+bdimx*bidx*4+64] = ptr[lane+warpid*128+64];
dev_output[lane+warpid*128+bdimx*bidx*4+96] = ptr[lane+warpid*128+96];
__syncthreads();
}
__global__ void kernel_sumBlock(float* dev_blockSum, const float* dev_input, float* dev_output ) {
extern __shared__ float ptr[];
scanBlockAnyLength4(ptr,dev_blockSum,dev_input,dev_output);
}
__global__ void kernel_offsetBlocks(float* dev_blockSum, float* dev_arr) {
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
const int bdimx = blockDim.x;
const int lane = tidx & 31;
const int warpid = tidx >> 5;
if( warpid==0 ) sumWarp4_32(dev_blockSum);
float val = dev_blockSum[warpid];
dev_arr[warpid*128+lane] += val;
dev_arr[warpid*128+lane+32] += val;
dev_arr[warpid*128+lane+64] += val;
dev_arr[warpid*128+lane+96] += val;
}
void scan4( const float input[], float output[]) {
int blocks = 2;
int threadsPerBlock = 64; //multiple of 32
int smemsize = (threadsPerBlock*4+32)*sizeof(float);
float* dev_input, *dev_output;
cudaMalloc((void**)&dev_input,blocks*threadsPerBlock*4*sizeof(float));
cudaMalloc((void**)&dev_output,blocks*threadsPerBlock*4*sizeof(float));
float *dev_blockSum;
cudaMalloc((void**)&dev_blockSum,blocks*sizeof(float));
int offset = 0;
int Nrem = N;
int chunksize;
while( Nrem ) {
chunksize = max(Nrem,blocks*threadsPerBlock*4);
cudaMemcpy(dev_input,(void**)&input[offset],chunksize*sizeof(float),cudaMemcpyHostToDevice);
kernel_sumBlock<<<blocks,threadsPerBlock,smemsize>>>(dev_blockSum,dev_input,dev_output);
kernel_offsetBlocks<<<blocks,threadsPerBlock>>>(dev_blockSum,dev_output);
cudaMemcpy((void**)&output[offset],dev_output,chunksize*sizeof(float),cudaMemcpyDeviceToHost);
offset += chunksize;
Nrem -= chunksize;
}
cudaFree(dev_input);
cudaFree(dev_output);
}
int main() {
float h_vec[N], sol[N];
for( int i = 0; i < N; i++ ) h_vec[i] = (float)i+1.0f;
scan4(h_vec,sol);
cout << "solution:" << endl;
for( int i = 0; i < N; i++ ) cout << i << " " << (i+2)*(i+1)/2 << " " << sol[i] << endl;
return 0;
}
To my eye, the code is throwing errors because the lines in sumWarp4_128
are not executed in order within a warp. I.e, the if( lane==0 )
lines are executing before the other logical blocks that precede it. I thought this was not possible within a warp.
If I __syncthreads()
before and after the lane==0
calls, I get some new exotic error that I just can't figure out.
Any help to point out where I've gone wrong would be appreciated