
I have modified the int version of vector add so that it adds two complex vectors. The code below works, but I am confused:

#include <stdio.h>
#include <cmath>
#include <complex>

#define N (2048*2048)
#define THREADS_PER_BLOCK 512

__global__ void add(std::complex<double> *a, std::complex<double> *b, std::complex<double> *c)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // c[index] = a[index] + b[index];
  // c[index] = a[index].real();
  c[index] = a[index];
}


int main()
{
    // host side
    std::complex<double> *a;
    std::complex<double> *b;
    std::complex<double> *c;

    // device side
    std::complex<double> *d_a;
    std::complex<double> *d_b;
    std::complex<double> *d_c;

    int size = N * sizeof(std::complex<double>);

/* allocate space for device copies of a, b, c */

  cudaMalloc( (void **) &d_a, size );
  cudaMalloc( (void **) &d_b, size );
  cudaMalloc( (void **) &d_c, size );

/* allocate space for host copies of a, b, c and setup input values */

  a = (std::complex<double>*)malloc( size );
  b = (std::complex<double>*)malloc( size );
  c = (std::complex<double>*)malloc( size );

  for( int i = 0; i < N; i++ )
  {
    a[i] = b[i] = i;
    c[i] = 0;
  }


  cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
  cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );

  add<<< std::ceil(N / (double)THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>( d_a, d_b, d_c );
  cudaDeviceSynchronize();


  cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost);

  bool success = true;
  for( int i = 0; i < N; i++ )
  {
    // if( c[i] != a[i] + b[i])
    if( c[i] != a[i] )
    {
      printf("c[%d] = %d\n",i,c[i] );
      success = false;
      break;
    }
  }

  printf("%s\n", success ? "success" : "fail");

  free(a);
  free(b);
  free(c);
  cudaFree( d_a );
  cudaFree( d_b );
  cudaFree( d_c );

  return 0;
}

For the kernel function:

__global__ void add(std::complex<double> *a, std::complex<double> *b, std::complex<double> *c)
    {
      int index = threadIdx.x + blockIdx.x * blockDim.x;
      // c[index] = a[index] + b[index];
      // c[index] = a[index].real();
      c[index] = a[index];
    }

Line

c[index] = a[index];

will call std::complex's operator=. This compiles, but when I change it to one of the following lines:

c[index] = a[index] + b[index]; // first one
c[index] = a[index].real();     // second one

it no longer compiles. The error message for the first one is:

complex.cu(10): error: calling a host function("std::operator + ") from a global function("add") is not allowed

complex.cu(10): error: identifier "std::operator + " is undefined in device code

The error message when changing to the second one is:

complex.cu(11): error: calling a constexpr host function("real") from a global function("add") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.

1 error detected in the compilation of "/tmp/tmpxft_000157af_00000000-8_complex.cpp1.ii".

The compile command I used:

/usr/local/cuda-10.2/bin/nvcc -o complex complex.cu
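
As an aside, the second error message itself names a possible workaround for the real() case, the '--expt-relaxed-constexpr' flag; adding it to the same command would look like the line below (per the message it only relaxes the constexpr real() call and does not help with the operator+ error):

/usr/local/cuda-10.2/bin/nvcc --expt-relaxed-constexpr -o complex complex.cu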

I know that device code cannot call host code, and that real() and operator+ for std::complex are both host code, so they cannot be called in a kernel function. However, I do not understand why std::complex's operator= compiles in my kernel function.

Update: after overloading operator+ for std::complex, the above code achieves the desired result:

__host__ __device__ std::complex<double> operator+(const std::complex<double>& a, const std::complex<double>& b)
{
    // std::complex<double> is guaranteed to have the same layout as double[2],
    // so array-oriented access through reinterpret_cast is well defined
    const double* aArg = reinterpret_cast<const double*>(&a);
    const double* bArg = reinterpret_cast<const double*>(&b);
    double retVal[2] = { aArg[0] + bArg[0], aArg[1] + bArg[1] };
    return *reinterpret_cast<std::complex<double>*>(retVal);
}

The root cause is that the underlying representation of std::complex<T> is in fact an array of two elements of the type you defined, e.g. double[2]; the benefit is that the same function parameters can be used on the host and device side. However, I still recommend using thrust::complex or another complex library in CUDA.
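
For reference, a minimal sketch of what the kernel could look like with thrust::complex<double> instead (Thrust ships with the CUDA toolkit; its complex type provides __host__ __device__ arithmetic operators and should share the same two-double layout, so the rest of the program can stay largely unchanged apart from the types):

#include <thrust/complex.h>

__global__ void add(thrust::complex<double> *a, thrust::complex<double> *b, thrust::complex<double> *c)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  // thrust::complex defines operator+ as __host__ __device__, so this compiles for device code
  c[index] = a[index] + b[index];
}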

BruceSun
  • the compiler is using an "implicitly declared copy assignment operator", see [here](https://en.cppreference.com/w/cpp/language/copy_assignment). Similar hijinks are not possible with operator +, for example. – Robert Crovella Mar 10 '21 at 14:23
  • Related, but not quite a dupe: [this question](https://stackoverflow.com/questions/17473826/cuda-how-to-work-with-complex-numbers). – einpoklum Mar 10 '21 at 16:08

1 Answer


No, CUDA C++ does not implement std::complex<T>::operator+() as a built-in.

The std::complex<T> type is not implemented for the GPU; all of its methods are written as host-only. The exceptions are constexpr methods and, as @RobertCrovella noted, the compiler's willingness to treat some or all implicitly-declared methods (e.g. copy constructors or assignment operators) as __host__ __device__. This is why c[index] = a[index] works: it uses an implicitly-declared copy assignment operator.
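
As a rough, hypothetical illustration of that rule (the type below is a stand-in, not the actual libstdc++ code): an implicitly-declared copy assignment of a plain struct is usable from device code, while a user-declared host-only operator is not:

struct fake_complex {   // hypothetical stand-in for std::complex<double>
    double re, im;
    // copy assignment is implicitly declared, so the compiler is willing
    // to use it in device code (see the comment above)
};

// user-declared and host-only, like std::operator+ for std::complex
__host__ fake_complex operator+(const fake_complex& a, const fake_complex& b)
{
    return { a.re + b.re, a.im + b.im };
}

__global__ void demo(fake_complex* out, const fake_complex* in)
{
    out[threadIdx.x] = in[threadIdx.x];  // fine: implicitly-declared copy assignment
    // out[threadIdx.x] = in[threadIdx.x] + in[threadIdx.x];  // error: host function call from a __global__ function
}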

For using complex numbers on the device side, consider this question:

CUDA - How to work with complex numbers?

einpoklum
  • Thanks for answering. I checked that std::complex will have an explicit [operator=](https://en.cppreference.com/w/cpp/numeric/complex/operator%3D) since C++20; does that mean that from C++20 on, my sample code will also fail? Also, after checking the intermediate code generated by the nvcc compiler, I see: `template< class _Tp> std::complex< float> & # 1165 "/usr/include/c++/8/complex" 3 operator=(const std::complex< _Tp> &__z) } ` It also has some sin/cos functions; are these host or device code? – BruceSun Mar 11 '21 at 01:05
  • @BruceSun: Yes, your sample code will likely fail if you compile it with `--std=c++20`; but that's not possible until CUDA completes support for the new standard. – einpoklum Mar 11 '21 at 09:05