
I have a kernel that uses cuda::std::complex<float>, and in this kernel I want to do a warp reduction, following this post.

The warpReduce function:

template <typename T, unsigned int blockSize>
__device__ void warpReduce(volatile T *sdata, unsigned int tid) {
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
    if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
    if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}

I'm getting the error: error : no operator "+=" matches these operands, operand types are: volatile cuda::std::complex<float> += volatile cuda::std::complex<float>. Simply removing the volatile qualifier, as mentioned in this post, doesn't work. Is there any way I can still use a complex type (thrust/cuda::std) in warp reduction?

The kernel:

template <unsigned int blockSize>
__global__ void reduce6(cuda::std::complex<float>*g_idata, cuda::std::complex<float>*g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;
    while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
    if (tid < 32) warpReduce<cuda::std::complex<float>, blockSize>(sdata, tid);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

I found a workaround by doing a reinterpret_cast to float2/double2 first, but I don't know whether this has other implications; I have read about undefined behaviour with this kind of type punning. Other suggestions?

This works:

template <typename T>
struct myComplex;

template <>
struct myComplex<float>
{
    typedef float2 type;
};

template <>
struct myComplex<double>
{
    typedef double2 type;
};

template <typename T>
__device__ void warpReduce(volatile T *SharedData, int tid)
{
    SharedData[tid].x += SharedData[tid + 32].x;
    SharedData[tid].x += SharedData[tid + 16].x;
    SharedData[tid].x += SharedData[tid + 8].x;
    SharedData[tid].x += SharedData[tid + 4].x;
    SharedData[tid].x += SharedData[tid + 2].x;
    SharedData[tid].x += SharedData[tid + 1].x;

    SharedData[tid].y += SharedData[tid + 32].y;
    SharedData[tid].y += SharedData[tid + 16].y;
    SharedData[tid].y += SharedData[tid + 8].y;
    SharedData[tid].y += SharedData[tid + 4].y;
    SharedData[tid].y += SharedData[tid + 2].y;
    SharedData[tid].y += SharedData[tid + 1].y;
}

// and then in the kernel:
warpReduce(reinterpret_cast<typename myComplex<T>::type *>(data), tid);
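For what it's worth, a compile-time sanity check of the layout assumption behind that cast could look like the following; it does not remove the formal undefined-behaviour concern, it only catches obvious size/alignment mismatches:

#include <cuda/std/complex>

// Compile-time sanity checks for the layout assumption behind the cast.
// They do not make the reinterpret_cast formally well-defined, they only
// catch obvious size/alignment mismatches.
static_assert(sizeof(cuda::std::complex<float>) == sizeof(float2),
              "complex<float> and float2 differ in size");
static_assert(alignof(cuda::std::complex<float>) == alignof(float2),
              "complex<float> and float2 differ in alignment");
static_assert(sizeof(cuda::std::complex<double>) == sizeof(double2),
              "complex<double> and double2 differ in size");
static_assert(alignof(cuda::std::complex<double>) == alignof(double2),
              "complex<double> and double2 differ in alignment");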
rinkert
1 Answer

According to my testing, in CUDA 11.7, the issue revolves around the use of volatile: cuda::std::complex does not provide operator overloads for volatile-qualified operands, which is exactly what the compiler error is reporting.

As discussed in this blog, this style of programming (implicit warp-synchronous programming) is deprecated anyway.

Additionally, this part of your posted code cannot be correct, because the dynamic shared memory array is declared as int but is used to hold cuda::std::complex<float> values:

extern __shared__ int sdata[];

Combining these ideas, we can do the following:

$ cat t7.cu
#include <cuda/std/complex>
#include <iostream>
// assumes blocksize is 64 or larger power of 2 up to max of 512 (or 1024 see below)
template <typename T>
__device__ void warpReduce(T *sdata, unsigned int tid) {

  T v = sdata[tid]; 
  v += sdata[tid+32]; 
  sdata[tid] = v;     __syncwarp();
  v += sdata[tid+16]; __syncwarp();
  sdata[tid] = v;     __syncwarp();
  v += sdata[tid+8];  __syncwarp();
  sdata[tid] = v;     __syncwarp();
  v += sdata[tid+4];  __syncwarp();
  sdata[tid] = v;     __syncwarp();
  v += sdata[tid+2];  __syncwarp();
  sdata[tid] = v;     __syncwarp();
  v += sdata[tid+1];  __syncwarp();
  sdata[tid] = v;
}

template <unsigned int blockSize, typename T>
__global__ void reduce6(T *g_idata, T *g_odata, size_t n) {
    extern __shared__ T sdata[];
    unsigned int tid = threadIdx.x;
    size_t i = blockIdx.x*(blockSize*2) + tid;
    size_t gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;
    while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
    __syncthreads();
//  if (blockSize == 1024) { if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads(); }  // uncomment to support blocksize of 1024
    if (blockSize >= 512)  { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256)  { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128)  { if (tid < 64)  { sdata[tid] += sdata[tid + 64];  } __syncthreads(); }
    if (tid < 32) warpReduce(sdata, tid);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

using my_t = cuda::std::complex<float>;
int main(){
  size_t n = 2048;
  const unsigned blk = 128;
  unsigned grid = n/blk/2;
  my_t *i, *h_i;
  my_t *o, *h_o;
  h_i = new my_t[n];
  h_o = new my_t[grid];
  for (size_t i = 0; i < n; i++) h_i[i] = {1,1};
  cudaMalloc(&i, n*sizeof(my_t));
  cudaMalloc(&o, grid*sizeof(my_t));
  cudaMemcpy(i, h_i, n*sizeof(my_t), cudaMemcpyHostToDevice);
  reduce6<blk><<<grid,blk, blk*sizeof(my_t)>>>(i, o, n);
  cudaMemcpy(h_o, o, grid*sizeof(my_t), cudaMemcpyDeviceToHost);
  for (int i = 0; i < grid; i++)
    std::cout << cuda::std::real(h_o[i]) << "," <<  cuda::std::imag(h_o[i]) <<  std::endl;
  cudaDeviceSynchronize();
}
$ nvcc -o t7 t7.cu
$ compute-sanitizer ./t7
========= COMPUTE-SANITIZER
256,256
256,256
256,256
256,256
256,256
256,256
256,256
256,256
========= ERROR SUMMARY: 0 errors
$
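As a side note, if you prefer to avoid shared memory entirely for the last-warp stage, a warp-shuffle based finish is another possibility. The following is only a sketch (warpReduceShfl is not part of the code above); it shuffles the real and imaginary parts separately, since __shfl_down_sync does not accept cuda::std::complex directly:

#include <cuda/std/complex>

// Hypothetical shuffle-based finish (not part of the code above).
// It assumes T is float or double and that all 32 lanes of the first
// warp call it, e.g. under "if (tid < 32)" with blockSize >= 64.
// Real and imaginary parts are shuffled separately because
// __shfl_down_sync does not accept cuda::std::complex directly.
template <typename T>
__device__ cuda::std::complex<T> warpReduceShfl(cuda::std::complex<T> v) {
  for (int offset = 16; offset > 0; offset >>= 1) {
    T re = __shfl_down_sync(0xFFFFFFFF, v.real(), offset);
    T im = __shfl_down_sync(0xFFFFFFFF, v.imag(), offset);
    v += cuda::std::complex<T>(re, im);
  }
  return v;   // lane 0 ends up with the warp total
}

// The tail of reduce6 would then read something like:
//   if (tid < 32) {
//     auto v = warpReduceShfl(sdata[tid] + sdata[tid + 32]);
//     if (tid == 0) g_odata[blockIdx.x] = v;
//   }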
Robert Crovella