[part 2]change type of function args (#38886)

86434818 · Zhang Ting · GitHub · df5d55bb · 86434818
显示空白变更内容
内联并排

Showing with 111 addition and 112 deletion

paddle/fluid/operators/activation_op.cu paddle/fluid/operators/activation_op.cu +111 -112

未找到文件。
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -24,7 +24,7 @@ struct CudaReluFunctor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);
  // relu(x) = max(x, 0)
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    return x > zero ? x : zero;
  }
 };
@@ -34,7 +34,7 @@ struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
  T zero = static_cast<T>(0.0f);
  // dx = dout * (out > 0)
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return out > zero ? dout : zero;
  }
@@ -51,7 +51,7 @@ struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
  }
  // leakyrelu(x) = x > 0 ? x : alpha * x
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    return x > zero ? x : static_cast<T>(alpha) * x;
  }
 };
@@ -66,7 +66,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = dout * (x > 0 ? 1 : alpha)
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return x > zero ? dout : static_cast<T>(alpha) * dout;
  }
@@ -79,7 +79,7 @@ struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // sigmoid(x) = 1 / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(one / (one + exp(-x)));
  }
@@ -90,7 +90,7 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // dx = dout * out * (1 - out)
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return dout * out * (one - out);
  }
@@ -103,7 +103,7 @@ struct CudaSiluFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // silu(x) = x / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(x / (one + exp(-x)));
  }
@@ -115,8 +115,8 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType temp = one / (one + exp(-x));
@@ -135,7 +135,7 @@ struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
  // For numerical stability,
  // logsigmoid(x) =
  //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType temp = x > zero ? zero : -x;
    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
@@ -151,8 +151,8 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
  // For numerical stability:
  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
  // 0)))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType temp1 = x > zero ? zero : -x;
@@ -168,7 +168,7 @@ struct CudaAtanFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // atan(x) = atan(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(atan(x));
  }
@@ -179,7 +179,7 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // dx = dout / (1 + x^2)
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return dout / (one + x * x);
  }
@@ -197,7 +197,7 @@ struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
  // softshrink(x) = x - lambda, if x > lambda;
  //                 x + lambda, if x < -lambda;
  //                 0, otherwise.
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    T l = static_cast<T>(lambda);
    T temp1 = static_cast<T>(x > l);
    T temp2 = static_cast<T>(x < -l);
@@ -215,7 +215,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = dout, if x > lambda or x < -lambda else 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    T l = static_cast<T>(lambda);
    return (x >= -l && x <= l) ? zero : dout;
  }
@@ -228,7 +228,7 @@ struct CudaCeilFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // ceil(x) = ceil(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(ceil(x));
  }
@@ -239,7 +239,7 @@ struct CudaFloorFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // floor(x) = floor(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(floor(x));
  }
@@ -250,7 +250,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // round(x) = round(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(round(x));
  }
@@ -259,7 +259,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor<T> {
 // GradFunctor for ceil, floor and round
 template <typename T>
 struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    return static_cast<T>(0.0f);
  }
@@ -271,7 +271,7 @@ struct CudaCosFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // cos(x) = cos(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(cos(x));
  }
@@ -282,8 +282,8 @@ struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // dx = dout * (-sin(x))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(-dout * sin(x));
@@ -297,7 +297,7 @@ struct CudaSinFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // sin(x) = sin(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(sin(x));
  }
@@ -308,8 +308,8 @@ struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // dx = dout * cos(x)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * cos(x));
@@ -323,7 +323,7 @@ struct CudaTanFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // tan(x) = tan(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(tan(x));
  }
@@ -334,8 +334,8 @@ struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // dx = dout / cos(x)^2
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout / (cos(x) * cos(x)));
@@ -349,7 +349,7 @@ struct CudaAsinFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // asin(x) = asin(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(asin(x));
  }
@@ -361,8 +361,8 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // dx = dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout / sqrt(one - x * x));
@@ -376,7 +376,7 @@ struct CudaAcosFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // acos(x) = acos(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(acos(x));
  }
@@ -388,8 +388,8 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // dx = -dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(-dout / sqrt(one - x * x));
@@ -403,7 +403,7 @@ struct CudaCoshFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // cosh(x) = cosh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(cosh(x));
  }
@@ -414,8 +414,8 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // dx = dout * sinh(x)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * sinh(x));
@@ -429,7 +429,7 @@ struct CudaSinhFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // sinh(x) = sinh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(sinh(x));
  }
@@ -440,8 +440,8 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // dx = dout * cosh(x)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * cosh(x));
@@ -455,7 +455,7 @@ struct CudaTanhFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // tanh(x) = tanh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(tanh(x));
  }
@@ -466,7 +466,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // dx = dout * (1 - out^2)
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return dout * (one - out * out);
  }
@@ -478,7 +478,7 @@ struct CudaAcoshFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // Acosh(x) = acosh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(acosh(x));
  }
@@ -489,8 +489,8 @@ struct CudaAcoshGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  // dx = dout * 1 / sqrt(x^2 - 1)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * one / sqrt(x * x - one));
@@ -504,7 +504,7 @@ struct CudaAsinhFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // Asinh(x) = asinh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(asinh(x));
  }
@@ -516,8 +516,8 @@ struct CudaAsinhGradFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // dx = dout * 1/sqrt(x^2 + 1)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * one / sqrt(x * x + one));
@@ -531,7 +531,7 @@ struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // Atanh(x) = atanh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(atanh(x));
  }
@@ -542,8 +542,8 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  // dx = dout * 1/(1- x^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * one / (one - x * x));
@@ -557,13 +557,13 @@ struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // reciprocal(x) = 1 / x
-  __device__ __forceinline__ T operator()(const T& x) const { return one / x; }
+  __device__ __forceinline__ T operator()(const T x) const { return one / x; }
 };
 template <typename T>
 struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
  // dx = -dout * out^2
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return -dout * out * out;
  }
@@ -575,7 +575,7 @@ struct CudaExpFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // exp(x) = exp(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(exp(x));
  }
@@ -584,7 +584,7 @@ struct CudaExpFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return dout * out;
  }
@@ -596,7 +596,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // expm1(x) = expm1(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(expm1(x));
  }
@@ -605,7 +605,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return dout * out + dout;
  }
@@ -617,7 +617,7 @@ struct CudaLogFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // log(x) = log(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(log(x));
  }
@@ -626,7 +626,7 @@ struct CudaLogFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
  // dx = dout / x
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return dout / x;
  }
@@ -636,7 +636,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaSquareFunctor : public BaseActivationFunctor<T> {
  // square(x) = x * x
-  __device__ __forceinline__ T operator()(const T& x) const { return x * x; }
+  __device__ __forceinline__ T operator()(const T x) const { return x * x; }
 };
 template <typename T>
@@ -644,7 +644,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
  T two = static_cast<T>(2.0f);
  // dx = dout * 2 * x
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return dout * two * x;
  }
@@ -656,7 +656,7 @@ struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // sqrt(x) = sqrt(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(sqrt(x));
  }
@@ -667,7 +667,7 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
  T one_half = static_cast<T>(0.5f);
  // dx = dout * 0.5 / out
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return one_half * dout / out;
  }
@@ -679,7 +679,7 @@ struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // rsqrt(x) = rsqrt(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(rsqrt(x));
  }
@@ -690,7 +690,7 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
  T minus_one_half = static_cast<T>(-0.5f);
  // dx = -0.5 * dout * out^3
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return minus_one_half * dout * out * out * out;
  }
@@ -703,7 +703,7 @@ struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
  MPType one = static_cast<MPType>(1.0f);
  // log1p(x) = log(1 + x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(log(one + x));
  }
@@ -714,7 +714,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // dx = dout / (1 + x)
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return dout / (one + x);
  }
@@ -726,7 +726,7 @@ struct CudaLog2Functor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // log2(x) = log2(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(log2(x));
  }
@@ -738,7 +738,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
  T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
  // dx = dout / (x * log(2))
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return dout / (x * log_two);
  }
@@ -750,7 +750,7 @@ struct CudaLog10Functor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // log10(x) = log10(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(log10(x));
  }
@@ -762,7 +762,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
  T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
  // dx = dout / (x * log(10))
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return dout / (x * log_ten);
  }
@@ -779,7 +779,7 @@ struct CudaBReluFunctor : public BaseActivationFunctor<T> {
  }
  // brelu(x) = min(max(x, t_min), t_max)
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    T t_min_cast = static_cast<T>(t_min);
    T t_max_cast = static_cast<T>(t_max);
    T temp_max = x > t_min_cast ? x : t_min_cast;
@@ -799,7 +799,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = (x > t_min && x < t_max) ? dout : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    T t_min_cast = static_cast<T>(t_min);
    T t_max_cast = static_cast<T>(t_max);
    return (x > t_min_cast && x < t_max_cast) ? dout : zero;
@@ -820,7 +820,7 @@ struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
  // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold)))
  // threshold should not be negative
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType t = static_cast<MPType>(threshold);
    MPType temp_min = x < t ? x : t;
@@ -841,8 +841,8 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
  // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0
  // threshold should not be negative
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_out) const {
+                                          const T arg_out) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType out = static_cast<MPType>(arg_out);
    MPType t = static_cast<MPType>(threshold);
@@ -864,7 +864,7 @@ struct CudaSTanhFunctor : public BaseActivationFunctor<T> {
  }
  // stanh(x) = b * tanh(a * x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType a = static_cast<MPType>(scale_a);
    MPType b = static_cast<MPType>(scale_b);
@@ -884,8 +884,8 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType a = static_cast<MPType>(scale_a);
@@ -909,7 +909,7 @@ struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
  }
  // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType b = static_cast<MPType>(beta);
    MPType t = static_cast<MPType>(threshold);
@@ -930,8 +930,8 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType b = static_cast<MPType>(beta);
@@ -948,7 +948,7 @@ struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // softsign(x) = x / (1 + abs(x))
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    return x / (one + abs(x));
  }
 };
@@ -958,7 +958,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
  T one = static_cast<T>(1.0f);
  // dx = dout / (1 + abs(x))^2
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    T temp = one + abs(x);
    return dout / (temp * temp);
  }
@@ -976,7 +976,7 @@ struct CudaRelu6Functor : public BaseActivationFunctor<T> {
  }
  // relu6(x) = min(max(0, x), 6)
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    T t = static_cast<T>(threshold);
    return x <= zero ? zero : (x < t ? x : t);
  }
@@ -992,7 +992,7 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = (out > 0 && out < t) ? dout : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    T t = static_cast<T>(threshold);
    return (out > zero && out < t) ? dout : zero;
  }
@@ -1005,7 +1005,7 @@ struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // tanhshrink(x) = x - tanh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(x - tanh(x));
  }
@@ -1016,8 +1016,8 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  // dx = dout * tanh(x)^2
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    return static_cast<T>(dout * tanh(x) * tanh(x));
@@ -1036,7 +1036,7 @@ struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
  }
  // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    T t = static_cast<T>(threshold);
    return (x > -t && x < t) ? zero : x;
  }
@@ -1052,7 +1052,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = (x > -threshold && x < threshold) ? 0 : dout
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    T t = static_cast<T>(threshold);
    return (x > -t && x < t) ? zero : dout;
  }
@@ -1074,7 +1074,7 @@ struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
  // hard_sigmoid(x) = 0, when x <= -3
  //                   1, when x >= 3
  //                   x * slope + offset, otherwise
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
    T temp_max = temp > zero ? temp : zero;
    T temp_min = temp_max < one ? temp_max : one;
@@ -1094,7 +1094,7 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = (out > 0 && out < 1) ? dout * slope : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
    return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
  }
@@ -1112,7 +1112,7 @@ struct CudaSwishFunctor : public BaseActivationFunctor<T> {
  }
  // swish(x) = x / (1 + exp(-beta * x))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType b = static_cast<MPType>(beta);
    return static_cast<T>(x / (one + exp(-b * x)));
@@ -1130,8 +1130,8 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType b = static_cast<MPType>(beta);
@@ -1159,7 +1159,7 @@ struct CudaMishFunctor : public BaseActivationFunctor<T> {
  // softplus(x) = x, if x > threshold
  //             = ln(1 + exp(x)), otherwise
  // Inputs: args[0], the input x
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
    return static_cast<T>(x * tanh(sp));
@@ -1180,8 +1180,8 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
  // sp = softplus(x)
  // Inputs: args[0], the input dout
  //         args[1], the input x
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
@@ -1204,7 +1204,7 @@ struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
  }
  // thresholded_relu(x) = x > threshold ? x : 0
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    return x > static_cast<T>(threshold) ? x : zero;
  }
 };
@@ -1219,7 +1219,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
  }
  // dx = x > threshold ? dout : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    return x > static_cast<T>(threshold) ? dout : zero;
  }
@@ -1241,7 +1241,7 @@ struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
  //                 x , when x >= threshold - offset
  //                 x * (x + offset) / scale, otherwise
  // threshold = scale = 6, offset = 3 by default
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
    T t = static_cast<T>(threshold);
    T temp = x + static_cast<T>(offset);
    T temp_max = temp > zero ? temp : zero;
@@ -1267,7 +1267,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
  //      dout , when x >= threshold - offset
  //      dout * (2 * x / scale + offset / scale), otherwise
  // threshold = scale = 6, offset = 3 by default
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
    T o = static_cast<T>(offset);
    T s = static_cast<T>(scale);
    T temp1 = static_cast<T>(x + o > zero);
@@ -1291,7 +1291,7 @@ struct CudaELUFunctor : public BaseActivationFunctor<T> {
  // elu(x) = x, if x > 0
  // elu(x) = alpha * (e^x - 1), if x <= 0
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    CT x = static_cast<CT>(arg_x);
    CT temp = static_cast<CT>(alpha) * (exp(x) - one);
    CT res = x > zero ? x : temp;
@@ -1312,8 +1312,7 @@ struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
  // case 1: alpha >= 0
  // dx = dout, if out > 0
  // dx = dout * (out + alpha), if out <= 0
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
-                                          const T& arg_out) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType out = static_cast<MPType>(arg_out);
    MPType a = static_cast<MPType>(alpha);
@@ -1338,8 +1337,8 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
  // case 2: alpha < 0
  // dx = dout, if x > 0
  // dx = dout * (out + alpha), if x <=0
-  __device__ __forceinline__ T operator()(const T& arg_dout, const T& arg_out,
+  __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType out = static_cast<MPType>(arg_out);
    MPType x = static_cast<MPType>(arg_x);
@@ -1393,7 +1392,7 @@ struct CudaCELUFunctor : public BaseActivationFunctor<T> {
  }
  // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
    CT x = static_cast<CT>(arg_x);
    CT temp = static_cast<CT>(alpha) * (exp(x / static_cast<CT>(alpha)) - one);
    CT res = (x > zero ? x : zero) + (temp > zero ? zero : temp);
@@ -1416,8 +1415,8 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor<T> {
  // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0
  // dx = dout , if alpha < 0 and x > 0
  // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0
-  __device__ __forceinline__ T operator()(const T& arg_dout,
+  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T& arg_x) const {
+                                          const T arg_x) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType x = static_cast<MPType>(arg_x);
    MPType a = static_cast<MPType>(alpha);