I have a kernel that uses cuda::std::complex<float>, and in this kernel I want to do a warp reduction, following this post. The warpReduce function:
template <typename T, unsigned int blockSize>
__device__ void warpReduce(volatile T *sdata, unsigned int tid) {
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
    if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
    if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
I'm getting the error: error : no operator "+=" matches these operands, operand types are: volatile cuda::std::complex<float> += volatile cuda::std::complex<float>.
Simply removing the volatile, as mentioned in this post, doesn't work. Is there any way I can still use a complex type (thrust or cuda::std) in a warp reduction?
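If I understand the CUDA warp-level primitives documentation correctly, simply dropping volatile is only safe when explicit __syncwarp() calls order the shared-memory reads and writes. A minimal sketch of what I mean (warpReduceSync is my own name; it assumes the whole first warp enters, as in the kernel below):

template <typename T, unsigned int blockSize>
__device__ void warpReduceSync(T *sdata, unsigned int tid) {
    // No volatile: each read/write pair is ordered by __syncwarp() instead.
    T v = sdata[tid];
    if (blockSize >= 64) { v += sdata[tid + 32]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >= 32) { v += sdata[tid + 16]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >= 16) { v += sdata[tid +  8]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  8) { v += sdata[tid +  4]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  4) { v += sdata[tid +  2]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  2) { v += sdata[tid +  1]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
}

But I don't know whether that is the idiomatic approach for complex types either.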
The kernel:
template <unsigned int blockSize>
__global__ void reduce6(cuda::std::complex<float> *g_idata, cuda::std::complex<float> *g_odata, unsigned int n) {
    // shared memory must hold the complex element type (the original post used int)
    extern __shared__ cuda::std::complex<float> sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;
    while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
    if (tid < 32) warpReduce<cuda::std::complex<float>, blockSize>(sdata, tid);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
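For reference, I launch it roughly like this (d_idata, d_odata, and n are placeholders; as in the original post, n is assumed to be a multiple of blockSize*2, since the load reads g_idata[i+blockSize] unguarded):

const unsigned int blockSize = 256;
unsigned int blocks = 64; // each block then grid-strides over the input
reduce6<blockSize><<<blocks, blockSize, blockSize * sizeof(cuda::std::complex<float>)>>>(d_idata, d_odata, n);

The third launch parameter sizes the dynamic shared memory for one complex value per thread.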
I found a workaround by doing a reinterpret_cast to float2/double2 first, but I don't know if this has any other implications; I read that this kind of type punning can be undefined behaviour. Other suggestions?
This works:
template <typename T>
struct myComplex;

template <>
struct myComplex<float>
{
    typedef float2 type;
};

template <>
struct myComplex<double>
{
    typedef double2 type;
};
template <typename T>
__device__ void warpReduce(volatile T *SharedData, int tid)
{
    // Reduce the real parts; .x is a plain float/double, so volatile += compiles.
    // Note: unlike the templated version above, this assumes blockSize >= 64.
    SharedData[tid].x += SharedData[tid + 32].x;
    SharedData[tid].x += SharedData[tid + 16].x;
    SharedData[tid].x += SharedData[tid + 8].x;
    SharedData[tid].x += SharedData[tid + 4].x;
    SharedData[tid].x += SharedData[tid + 2].x;
    SharedData[tid].x += SharedData[tid + 1].x;
    // Then the imaginary parts; each component chain is independent, so the
    // order relative to .x doesn't matter within the (lockstep) warp.
    SharedData[tid].y += SharedData[tid + 32].y;
    SharedData[tid].y += SharedData[tid + 16].y;
    SharedData[tid].y += SharedData[tid + 8].y;
    SharedData[tid].y += SharedData[tid + 4].y;
    SharedData[tid].y += SharedData[tid + 2].y;
    SharedData[tid].y += SharedData[tid + 1].y;
}
// and then in the kernel (sdata is the cuda::std::complex<float> shared array):
warpReduce(reinterpret_cast<myComplex<float>::type *>(sdata), tid);
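An alternative I've been looking at, to sidestep volatile shared memory entirely for the last warp, is a shuffle-based reduction that moves the two float components through __shfl_down_sync separately (warpReduceShfl is my own name; it assumes all 32 lanes of the warp are active):

#include <cuda/std/complex>

__device__ cuda::std::complex<float> warpReduceShfl(cuda::std::complex<float> val) {
    for (unsigned int offset = 16; offset > 0; offset >>= 1) {
        // Shuffle the 32-bit real and imaginary parts separately.
        float re = __shfl_down_sync(0xffffffff, val.real(), offset);
        float im = __shfl_down_sync(0xffffffff, val.imag(), offset);
        val += cuda::std::complex<float>(re, im);
    }
    return val; // only lane 0 holds the complete warp sum
}

Would that be preferable to the reinterpret_cast workaround?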