4
#include <algorithm>
#include <vector>
// Clamps each element of r into [cur_r_min, cur_r_max] and each element of d
// into [cur_d_min, cur_d_max], for n elements.
// NOTE(review): __min/__max are Microsoft-specific macros (MSVC), which is why
// this compiles on Windows but not with gcc/nvcc on Ubuntu (see prose below).
template <typename Dtype>
    __global__ void R_D_CUT(const int n, Dtype* r, Dtype* d
        , Dtype cur_r_max, Dtype cur_r_min, Dtype cur_d_max, Dtype cur_d_min) {
        // CUDA_KERNEL_LOOP: Caffe's grid-stride loop macro, index in [0, n)
        // (the error transcript below shows this is caffe:: code).
        CUDA_KERNEL_LOOP(index, n) {
            r[index] = __min(cur_r_max, __max(r[index], cur_r_min));  // clamp r[index]
            d[index] = __min(cur_d_max, __max(d[index], cur_d_min));  // clamp d[index]
        }
    }

The above code works well on Windows. However, it does not compile on Ubuntu because of the `__min` and `__max` functions. I tried to fix it by replacing `__min` with `std::min<Dtype>` and `__max` with `std::max<Dtype>`:

// Same clamping kernel, rewritten to use std::min/std::max.
// NOTE(review): this version does NOT build either — std::min/std::max are
// __host__ functions and cannot be called from a __global__ kernel, exactly as
// the nvcc errors quoted below report for both float and double instantiations.
template <typename Dtype>
    __global__ void R_D_CUT(const int n, Dtype* r, Dtype* d
        , Dtype cur_r_max, Dtype cur_r_min, Dtype cur_d_max, Dtype cur_d_min) {
        // CUDA_KERNEL_LOOP: Caffe's grid-stride loop macro, index in [0, n)
        CUDA_KERNEL_LOOP(index, n) {

            r[index] = std::min<Dtype>(cur_r_max, std::max<Dtype>(r[index], cur_r_min));
            d[index] = std::min<Dtype>(cur_d_max, std::max<Dtype>(d[index], cur_d_min));
        }
    }

However, when I recompile, I got the error

_layer.cu(7): error: calling a __host__ function("std::min<float> ") from a __global__ function("caffe::R_D_CUT<float> ") is not allowed

_layer.cu(7): error: calling a __host__ function("std::max<float> ") from a __global__ function("caffe::R_D_CUT<float> ") is not allowed

_layer_layer.cu(8): error: calling a __host__ function("std::min<float> ") from a __global__ function("caffe::R_D_CUT<float> ") is not allowed

_layer_layer.cu(8): error: calling a __host__ function("std::max<float> ") from a __global__ function("caffe::R_D_CUT<float> ") is not allowed

_layer_layer.cu(7): error: calling a __host__ function("std::min<double> ") from a __global__ function("caffe::R_D_CUT<double> ") is not allowed

_layer_layer.cu(7): error: calling a __host__ function("std::max<double> ") from a __global__ function("caffe::R_D_CUT<double> ") is not allowed

_layer_layer.cu(8): error: calling a __host__ function("std::min<double> ") from a __global__ function("caffe::R_D_CUT<double> ") is not allowed

_layer_layer.cu(8): error: calling a __host__ function("std::max<double> ") from a __global__ function("caffe::R_D_CUT<double> ") is not allowed

Could you help me fix it? Thanks

John
  • 2,838
  • 7
  • 36
  • 65

2 Answers

8

Generally speaking, functionality associated with std:: is not available in CUDA device code (__global__ or __device__ functions).

Instead, for many math functions, NVIDIA provides a CUDA math library.

For this case, as @njuffa points out, CUDA provides templated/overloaded versions of min and max. So you should just be able to use min() or max() in device code, assuming the type usage corresponds to one of the available templated/overloaded types. Also, you should:

#include <math.h>

Here is a simple worked example showing usage of min() for both float and double type:

$ cat t381.cu
#include <math.h>
#include <stdio.h>

// Demo kernel: prints the smaller of two scalars using CUDA's built-in device
// min() overloads (no std:: needed in device code).
template <typename T>
__global__ void mymin(T d1, T d2){

  // min(d1,d2) resolves to CUDA's overloaded device min for float/double;
  // %f works for both since varargs promote float to double.
  printf("min is :%f\n", min(d1,d2));
}


int main(){

  // One launch instantiates the double version, one the float version.
  mymin<<<1,1>>>(1.0, 2.0);
  mymin<<<1,1>>>(3.0f, 4.0f);
  // Block until the kernels finish so the device-side printf output is
  // flushed before the program exits.
  cudaDeviceSynchronize();
}
$ nvcc -arch=sm_52 -o t381 t381.cu
$ ./t381
min is :1.000000
min is :3.000000
$

Note that the available overloaded options even include some integer types

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • 1
    I am fairly certain CUDA has templated versions of `min()` and `max()`. Or at least overloaded ones. – njuffa Aug 03 '17 at 15:26
  • Thanks, it worked well. I just have one more question: in the case of non-GPU code (a .cpp file), I also need to use the max and min functions as above, but when I used your approach, I got the error `_layer.cpp:205:25: note: ‘max’ declared here, later in the translation unit`. Do we need any change for the .cpp version? – John Aug 04 '17 at 06:36
  • I would probably need to see a specific example. You might want to ask a new question, I think this now has nothing to do with CUDA. Are you declaring your own definition of `max`? It's not clear to me that what you are showing is actually an error anyway. – Robert Crovella Aug 04 '17 at 13:32
  • Please check it in here https://gist.github.com/John1231983/dd06a20a003f26fb30560ece71225b68#file-batch_renorm_layer-cpp-L203 . I must used std::max to solve it – John Aug 05 '17 at 16:35
2

Adding to @RobertCrovella's answer: If you want something which behaves more like std::max, you can use this templated wrapper over CUDA's math library:

// Device-side templated maximum() mimicking std::max's interface: one explicit
// specialization per supported type, each dispatching to the matching CUDA
// math-library intrinsic (max/umax/llmax/ullmax/fmaxf/fmax).
// NOTE(review): the long / unsigned long specializations route to the 64-bit
// llmax/ullmax — this assumes an LP64 platform (long is 64-bit, as on Linux);
// confirm behavior on LLP64 Windows where long is 32-bit.
#define __df__ __device__ __forceinline__
template <typename T> __df__ T maximum(T x, T y);
template <> __df__ int                 maximum<int               >(int x, int y)                               { return max(x,y);    }
template <> __df__ unsigned int        maximum<unsigned          >(unsigned int x, unsigned int y)             { return umax(x,y);   }
template <> __df__ long                maximum<long              >(long x, long y)                             { return llmax(x,y);  }
template <> __df__ unsigned long       maximum<unsigned long     >(unsigned long x, unsigned long y)           { return ullmax(x,y); }
template <> __df__ long long           maximum<long long         >(long long x, long long y)                   { return llmax(x,y);  }
template <> __df__ unsigned long long  maximum<unsigned long long>(unsigned long long x, unsigned long long y) { return ullmax(x,y); }
template <> __df__ float               maximum<float             >(float x, float y)                           { return fmaxf(x,y);  }
template <> __df__ double              maximum<double            >(double x, double y)                         { return fmax(x,y);   }
#undef __df__

(see here for a more complete set of these wrappers.)

einpoklum
  • 118,144
  • 57
  • 340
  • 684