diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 342ed3a6b19e244de3f088e105bedb0f2b04d29a..8cced5cd919f24af620972ed544b98016e64d26e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -24,7 +24,7 @@ struct CudaReluFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
 
   // relu(x) = max(x, 0)
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     return x > zero ? x : zero;
   }
 };
@@ -34,7 +34,7 @@ struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
 
   // dx = dout * (out > 0)
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return out > zero ? dout : zero;
   }
 
@@ -51,7 +51,7 @@ struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
   }
 
   // leakyrelu(x) = x > 0 ? x : alpha * x
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     return x > zero ? x : static_cast<T>(alpha) * x;
   }
 };
@@ -66,7 +66,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = dout * (x > 0 ? 1 : alpha)
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return x > zero ? dout : static_cast<T>(alpha) * dout;
   }
 
@@ -79,7 +79,7 @@ struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // sigmoid(x) = 1 / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(one / (one + exp(-x)));
   }
@@ -90,7 +90,7 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // dx = dout * out * (1 - out)
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return dout * out * (one - out);
   }
 
@@ -103,7 +103,7 @@ struct CudaSiluFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // silu(x) = x / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(x / (one + exp(-x)));
   }
@@ -115,8 +115,8 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType temp = one / (one + exp(-x));
@@ -135,7 +135,7 @@ struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
   // For numerical stability,
   // logsigmoid(x) =
   //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     MPType temp = x > zero ? zero : -x;
     return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
@@ -151,8 +151,8 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
   // For numerical stability:
   // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
   // 0)))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType temp1 = x > zero ? zero : -x;
@@ -168,7 +168,7 @@ struct CudaAtanFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // atan(x) = atan(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(atan(x));
   }
@@ -179,7 +179,7 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // dx = dout / (1 + x^2)
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return dout / (one + x * x);
   }
 
@@ -197,7 +197,7 @@ struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
   // softshrink(x) = x - lambda, if x > lambda;
   //                 x + lambda, if x < -lambda;
   //                 0, otherwise.
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     T l = static_cast<T>(lambda);
     T temp1 = static_cast<T>(x > l);
     T temp2 = static_cast<T>(x < -l);
@@ -215,7 +215,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = dout, if x > lambda or x < -lambda else 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     T l = static_cast<T>(lambda);
     return (x >= -l && x <= l) ? zero : dout;
   }
@@ -228,7 +228,7 @@ struct CudaCeilFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // ceil(x) = ceil(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(ceil(x));
   }
@@ -239,7 +239,7 @@ struct CudaFloorFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // floor(x) = floor(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(floor(x));
   }
@@ -250,7 +250,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // round(x) = round(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(round(x));
   }
@@ -259,7 +259,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor<T> {
 // GradFunctor for ceil, floor and round
 template <typename T>
 struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     return static_cast<T>(0.0f);
   }
 
@@ -271,7 +271,7 @@ struct CudaCosFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // cos(x) = cos(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(cos(x));
   }
@@ -282,8 +282,8 @@ struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // dx = dout * (-sin(x))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(-dout * sin(x));
@@ -297,7 +297,7 @@ struct CudaSinFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // sin(x) = sin(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(sin(x));
   }
@@ -308,8 +308,8 @@ struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // dx = dout * cos(x)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * cos(x));
@@ -323,7 +323,7 @@ struct CudaTanFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // tan(x) = tan(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(tan(x));
   }
@@ -334,8 +334,8 @@ struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // dx = dout / cos(x)^2
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout / (cos(x) * cos(x)));
@@ -349,7 +349,7 @@ struct CudaAsinFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // asin(x) = asin(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(asin(x));
   }
@@ -361,8 +361,8 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // dx = dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout / sqrt(one - x * x));
@@ -376,7 +376,7 @@ struct CudaAcosFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // acos(x) = acos(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(acos(x));
   }
@@ -388,8 +388,8 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // dx = -dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(-dout / sqrt(one - x * x));
@@ -403,7 +403,7 @@ struct CudaCoshFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // cosh(x) = cosh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(cosh(x));
   }
@@ -414,8 +414,8 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // dx = dout * sinh(x)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * sinh(x));
@@ -429,7 +429,7 @@ struct CudaSinhFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // sinh(x) = sinh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(sinh(x));
   }
@@ -440,8 +440,8 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // dx = dout * cosh(x)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * cosh(x));
@@ -455,7 +455,7 @@ struct CudaTanhFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // tanh(x) = tanh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(tanh(x));
   }
@@ -466,7 +466,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // dx = dout * (1 - out^2)
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return dout * (one - out * out);
   }
 
@@ -478,7 +478,7 @@ struct CudaAcoshFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // Acosh(x) = acosh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(acosh(x));
   }
@@ -489,8 +489,8 @@ struct CudaAcoshGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
   MPType one = static_cast<MPType>(1.0f);
   // dx = dout * 1 / sqrt(x^2 - 1)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * one / sqrt(x * x - one));
@@ -504,7 +504,7 @@ struct CudaAsinhFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // Asinh(x) = asinh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(asinh(x));
   }
@@ -516,8 +516,8 @@ struct CudaAsinhGradFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // dx = dout * 1/sqrt(x^2 + 1)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * one / sqrt(x * x + one));
@@ -531,7 +531,7 @@ struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // Atanh(x) = atanh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(atanh(x));
   }
@@ -542,8 +542,8 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
   MPType one = static_cast<MPType>(1.0f);
   // dx = dout * 1/(1- x^2)
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * one / (one - x * x));
@@ -557,13 +557,13 @@ struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // reciprocal(x) = 1 / x
-  __device__ __forceinline__ T operator()(const T& x) const { return one / x; }
+  __device__ __forceinline__ T operator()(const T x) const { return one / x; }
 };
 
 template <typename T>
 struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
   // dx = -dout * out^2
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return -dout * out * out;
   }
 
@@ -575,7 +575,7 @@ struct CudaExpFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // exp(x) = exp(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(exp(x));
   }
@@ -584,7 +584,7 @@ struct CudaExpFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
   // dx = dout * out
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return dout * out;
   }
 
@@ -596,7 +596,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // expm1(x) = expm1(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(expm1(x));
   }
@@ -605,7 +605,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
+  // dx = dout * (out + 1) = dout * out + dout
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return dout * out + dout;
   }
 
@@ -617,7 +617,7 @@ struct CudaLogFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // log(x) = log(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(log(x));
   }
@@ -626,7 +626,7 @@ struct CudaLogFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
   // dx = dout / x
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return dout / x;
   }
 
@@ -636,7 +636,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaSquareFunctor : public BaseActivationFunctor<T> {
   // square(x) = x * x
-  __device__ __forceinline__ T operator()(const T& x) const { return x * x; }
+  __device__ __forceinline__ T operator()(const T x) const { return x * x; }
 };
 
 template <typename T>
@@ -644,7 +644,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
   T two = static_cast<T>(2.0f);
 
   // dx = dout * 2 * x
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return dout * two * x;
   }
 
@@ -656,7 +656,7 @@ struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // sqrt(x) = sqrt(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(sqrt(x));
   }
@@ -667,7 +667,7 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
   T one_half = static_cast<T>(0.5f);
 
   // dx = dout * 0.5 / out
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return one_half * dout / out;
   }
 
@@ -679,7 +679,7 @@ struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // rsqrt(x) = rsqrt(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(rsqrt(x));
   }
@@ -690,7 +690,7 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
   T minus_one_half = static_cast<T>(-0.5f);
 
   // dx = -0.5 * dout * out^3
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return minus_one_half * dout * out * out * out;
   }
 
@@ -703,7 +703,7 @@ struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
   MPType one = static_cast<MPType>(1.0f);
 
   // log1p(x) = log(1 + x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(log(one + x));
   }
@@ -714,7 +714,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // dx = dout / (1 + x)
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return dout / (one + x);
   }
 
@@ -726,7 +726,7 @@ struct CudaLog2Functor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // log2(x) = log2(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(log2(x));
   }
@@ -738,7 +738,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
   T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
 
   // dx = dout / (x * log(2))
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return dout / (x * log_two);
   }
 
@@ -750,7 +750,7 @@ struct CudaLog10Functor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // log10(x) = log10(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(log10(x));
   }
@@ -762,7 +762,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
   T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
 
   // dx = dout / (x * log(10))
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return dout / (x * log_ten);
   }
 
@@ -779,7 +779,7 @@ struct CudaBReluFunctor : public BaseActivationFunctor<T> {
   }
 
   // brelu(x) = min(max(x, t_min), t_max)
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     T t_min_cast = static_cast<T>(t_min);
     T t_max_cast = static_cast<T>(t_max);
     T temp_max = x > t_min_cast ? x : t_min_cast;
@@ -799,7 +799,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = (x > t_min && x < t_max) ? dout : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     T t_min_cast = static_cast<T>(t_min);
     T t_max_cast = static_cast<T>(t_max);
     return (x > t_min_cast && x < t_max_cast) ? dout : zero;
@@ -820,7 +820,7 @@ struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
 
   // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold)))
   // threshold should not be negative
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     MPType t = static_cast<MPType>(threshold);
     MPType temp_min = x < t ? x : t;
@@ -841,8 +841,8 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
 
   // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0
   // threshold should not be negative
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_out) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_out) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType out = static_cast<MPType>(arg_out);
     MPType t = static_cast<MPType>(threshold);
@@ -864,7 +864,7 @@ struct CudaSTanhFunctor : public BaseActivationFunctor<T> {
   }
 
   // stanh(x) = b * tanh(a * x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     MPType a = static_cast<MPType>(scale_a);
     MPType b = static_cast<MPType>(scale_b);
@@ -884,8 +884,8 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType a = static_cast<MPType>(scale_a);
@@ -909,7 +909,7 @@ struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
   }
 
   // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     MPType b = static_cast<MPType>(beta);
     MPType t = static_cast<MPType>(threshold);
@@ -930,8 +930,8 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x))
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType b = static_cast<MPType>(beta);
@@ -948,7 +948,7 @@ struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // softsign(x) = x / (1 + abs(x))
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     return x / (one + abs(x));
   }
 };
@@ -958,7 +958,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
   // dx = dout / (1 + abs(x))^2
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     T temp = one + abs(x);
     return dout / (temp * temp);
   }
@@ -976,7 +976,7 @@ struct CudaRelu6Functor : public BaseActivationFunctor<T> {
   }
 
   // relu6(x) = min(max(0, x), 6)
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     T t = static_cast<T>(threshold);
     return x <= zero ? zero : (x < t ? x : t);
   }
@@ -992,7 +992,7 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = (out > 0 && out < t) ? dout : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     T t = static_cast<T>(threshold);
     return (out > zero && out < t) ? dout : zero;
   }
@@ -1005,7 +1005,7 @@ struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // tanhshrink(x) = x - tanh(x)
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(x - tanh(x));
   }
@@ -1016,8 +1016,8 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
 
   // dx = dout * tanh(x)^2
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     return static_cast<T>(dout * tanh(x) * tanh(x));
@@ -1036,7 +1036,7 @@ struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
   }
 
-  // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
+  // hardshrink(x) = (x > -threshold && x < threshold) ? 0 : x
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     T t = static_cast<T>(threshold);
     return (x > -t && x < t) ? zero : x;
   }
@@ -1052,7 +1052,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = (x > -threshold && x < threshold) ? 0 : dout
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     T t = static_cast<T>(threshold);
     return (x > -t && x < t) ? zero : dout;
   }
@@ -1074,7 +1074,7 @@ struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
   // hard_sigmoid(x) = 0, when x <= -3
   //                   1, when x >= 3
   //                   x * slope + offset, otherwise
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
     T temp_max = temp > zero ? temp : zero;
     T temp_min = temp_max < one ? temp_max : one;
@@ -1094,7 +1094,7 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = (out > 0 && out < 1) ? dout * slope : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& out) const {
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
     return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
   }
 
@@ -1112,7 +1112,7 @@ struct CudaSwishFunctor : public BaseActivationFunctor<T> {
   }
 
   // swish(x) = x / (1 + exp(-beta * x))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     MPType b = static_cast<MPType>(beta);
     return static_cast<T>(x / (one + exp(-b * x)));
@@ -1130,8 +1130,8 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
   }
 
-  // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2)
+  // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x)) / (1 + exp(-b * x))^2
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType b = static_cast<MPType>(beta);
@@ -1159,7 +1159,7 @@ struct CudaMishFunctor : public BaseActivationFunctor<T> {
   // softplus(x) = x, if x > threshold
   //             = ln(1 + exp(x)), otherwise
   // Inputs: args[0], the input x
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
     MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
     return static_cast<T>(x * tanh(sp));
@@ -1180,8 +1180,8 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
   // sp = softplus(x)
   // Inputs: args[0], the input dout
   //         args[1], the input x
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType sp = (x > static_cast<MPType>(threshold)) ? x : log(one + exp(x));
@@ -1204,7 +1204,7 @@ struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
   }
 
   // thresholded_relu(x) = x > threshold ? x : 0
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     return x > static_cast<T>(threshold) ? x : zero;
   }
 };
@@ -1219,7 +1219,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
   }
 
   // dx = x > threshold ? dout : 0
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     return x > static_cast<T>(threshold) ? dout : zero;
   }
 
@@ -1241,7 +1241,7 @@ struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
   //                 x , when x >= threshold - offset
   //                 x * (x + offset) / scale, otherwise
   // threshold = scale = 6, offset = 3 by default
-  __device__ __forceinline__ T operator()(const T& x) const {
+  __device__ __forceinline__ T operator()(const T x) const {
     T t = static_cast<T>(threshold);
     T temp = x + static_cast<T>(offset);
     T temp_max = temp > zero ? temp : zero;
@@ -1267,7 +1267,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
   //      dout , when x >= threshold - offset
   //      dout * (2 * x / scale + offset / scale), otherwise
   // threshold = scale = 6, offset = 3 by default
-  __device__ __forceinline__ T operator()(const T& dout, const T& x) const {
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
     T o = static_cast<T>(offset);
     T s = static_cast<T>(scale);
     T temp1 = static_cast<T>(x + o > zero);
@@ -1291,7 +1291,7 @@ struct CudaELUFunctor : public BaseActivationFunctor<T> {
 
   // elu(x) = x, if x > 0
   // elu(x) = alpha * (e^x - 1), if x <= 0
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     CT x = static_cast<CT>(arg_x);
     CT temp = static_cast<CT>(alpha) * (exp(x) - one);
     CT res = x > zero ? x : temp;
@@ -1312,8 +1312,7 @@ struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
   // case 1: alpha >= 0
   // dx = dout, if out > 0
   // dx = dout * (out + alpha), if out <= 0
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_out) const {
+  __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType out = static_cast<MPType>(arg_out);
     MPType a = static_cast<MPType>(alpha);
@@ -1338,8 +1337,8 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
   // case 2: alpha < 0
   // dx = dout, if x > 0
   // dx = dout * (out + alpha), if x <=0
-  __device__ __forceinline__ T operator()(const T& arg_dout, const T& arg_out,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType out = static_cast<MPType>(arg_out);
     MPType x = static_cast<MPType>(arg_x);
@@ -1393,7 +1392,7 @@ struct CudaCELUFunctor : public BaseActivationFunctor<T> {
   }
 
   // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1))
-  __device__ __forceinline__ T operator()(const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     CT x = static_cast<CT>(arg_x);
     CT temp = static_cast<CT>(alpha) * (exp(x / static_cast<CT>(alpha)) - one);
     CT res = (x > zero ? x : zero) + (temp > zero ? zero : temp);
@@ -1416,8 +1415,8 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor<T> {
   // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0
   // dx = dout , if alpha < 0 and x > 0
   // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0
-  __device__ __forceinline__ T operator()(const T& arg_dout,
-                                          const T& arg_x) const {
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType a = static_cast<MPType>(alpha);