diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 342ed3a6b19e244de3f088e105bedb0f2b04d29a..8cced5cd919f24af620972ed544b98016e64d26e 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -24,7 +24,7 @@ struct CudaReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x > zero ? x : zero; } }; @@ -34,7 +34,7 @@ struct CudaReluGradFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return out > zero ? dout : zero; } @@ -51,7 +51,7 @@ struct CudaLeakyReluFunctor : public BaseActivationFunctor { } // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x > zero ? x : static_cast(alpha) * x; } }; @@ -66,7 +66,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { } // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return x > zero ? dout : static_cast(alpha) * dout; } @@ -79,7 +79,7 @@ struct CudaSigmoidFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(one / (one + exp(-x))); } @@ -90,7 +90,7 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * out * (one - out); } @@ -103,7 +103,7 @@ struct CudaSiluFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(x / (one + exp(-x))); } @@ -115,8 +115,8 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType temp = one / (one + exp(-x)); @@ -135,7 +135,7 @@ struct CudaLogSigmoidFunctor : public BaseActivationFunctor { // For numerical stability, // logsigmoid(x) = // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType temp = x > zero ? zero : -x; return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); @@ -151,8 +151,8 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { // For numerical stability: // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, // 0))) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType temp1 = x > zero ? zero : -x; @@ -168,7 +168,7 @@ struct CudaAtanFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(atan(x)); } @@ -179,7 +179,7 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (one + x * x); } @@ -197,7 +197,7 @@ struct CudaSoftShrinkFunctor : public BaseActivationFunctor { // softshrink(x) = x - lambda, if x > lambda; // x + lambda, if x < -lambda; // 0, otherwise. - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T l = static_cast(lambda); T temp1 = static_cast(x > l); T temp2 = static_cast(x < -l); @@ -215,7 +215,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { } // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T l = static_cast(lambda); return (x >= -l && x <= l) ? zero : dout; } @@ -228,7 +228,7 @@ struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // ceil(x) = ceil(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(ceil(x)); } @@ -239,7 +239,7 @@ struct CudaFloorFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // floor(x) = floor(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(floor(x)); } @@ -250,7 +250,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // round(x) = round(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(round(x)); } @@ -259,7 +259,7 @@ struct CudaRoundFunctor : public BaseActivationFunctor { // GradFunctor for ceil, floor and round template struct CudaZeroGradFunctor : public BaseActivationFunctor { - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return static_cast(0.0f); } @@ -271,7 +271,7 @@ struct CudaCosFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(cos(x)); } @@ -282,8 +282,8 @@ struct CudaCosGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(-dout * sin(x)); @@ -297,7 +297,7 @@ struct CudaSinFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(sin(x)); } @@ -308,8 +308,8 @@ struct CudaSinGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * cos(x)); @@ -323,7 +323,7 @@ struct CudaTanFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(tan(x)); } @@ -334,8 +334,8 @@ struct CudaTanGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout / (cos(x) * cos(x))); @@ -349,7 +349,7 @@ struct CudaAsinFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(asin(x)); } @@ -361,8 +361,8 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout / sqrt(one - x * x)); @@ -376,7 +376,7 @@ struct CudaAcosFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(acos(x)); } @@ -388,8 +388,8 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(-dout / sqrt(one - x * x)); @@ -403,7 +403,7 @@ struct CudaCoshFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(cosh(x)); } @@ -414,8 +414,8 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * sinh(x)); @@ -429,7 +429,7 @@ struct CudaSinhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(sinh(x)); } @@ -440,8 +440,8 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * cosh(x)); @@ -455,7 +455,7 @@ struct CudaTanhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(tanh(x)); } @@ -466,7 +466,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * (one - out * out); } @@ -478,7 +478,7 @@ struct CudaAcoshFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(acosh(x)); } @@ -489,8 +489,8 @@ struct CudaAcoshGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; MPType one = static_cast(1.0f); // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * one / sqrt(x * x - one)); @@ -504,7 +504,7 @@ struct CudaAsinhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(asinh(x)); } @@ -516,8 +516,8 @@ struct CudaAsinhGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * one / sqrt(x * x + one)); @@ -531,7 +531,7 @@ struct CudaAtanhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(atanh(x)); } @@ -542,8 +542,8 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; MPType one = static_cast(1.0f); // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * one / (one - x * x)); @@ -557,13 +557,13 @@ struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // reciprocal(x) = 1 / x - __device__ __forceinline__ T operator()(const T& x) const { return one / x; } + __device__ __forceinline__ T operator()(const T x) const { return one / x; } }; template struct CudaReciprocalGradFunctor : public BaseActivationFunctor { // dx = -dout * out^2 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return -dout * out * out; } @@ -575,7 +575,7 @@ struct CudaExpFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // exp(x) = exp(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(exp(x)); } @@ -584,7 +584,7 @@ struct CudaExpFunctor : public BaseActivationFunctor { template struct CudaExpGradFunctor : public BaseActivationFunctor { // dx = dout * out - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * out; } @@ -596,7 +596,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // expm1(x) = expm1(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(expm1(x)); } @@ -605,7 +605,7 @@ struct CudaExpm1Functor : public BaseActivationFunctor { template struct CudaExpm1GradFunctor : public BaseActivationFunctor { // dx = dout * out - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return dout * out + dout; } @@ -617,7 +617,7 @@ struct CudaLogFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log(x) = log(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log(x)); } @@ -626,7 +626,7 @@ struct CudaLogFunctor : public BaseActivationFunctor { template struct CudaLogGradFunctor : public BaseActivationFunctor { // dx = dout / x - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / x; } @@ -636,7 +636,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { template struct CudaSquareFunctor : public BaseActivationFunctor { // square(x) = x * x - __device__ __forceinline__ T operator()(const T& x) const { return x * x; } + __device__ __forceinline__ T operator()(const T x) const { return x * x; } }; template @@ -644,7 +644,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { T two = static_cast(2.0f); // dx = dout * 2 * x - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout * two * x; } @@ -656,7 +656,7 @@ struct CudaSqrtFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sqrt(x) = sqrt(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(sqrt(x)); } @@ -667,7 +667,7 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { T one_half = static_cast(0.5f); // dx = dout * 0.5 / out - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return one_half * dout / out; } @@ -679,7 +679,7 @@ struct CudaRsqrtFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // rsqrt(x) = rsqrt(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(rsqrt(x)); } @@ -690,7 +690,7 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { T minus_one_half = static_cast(-0.5f); // dx = -0.5 * dout * out^3 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return minus_one_half * dout * out * out * out; } @@ -703,7 +703,7 @@ struct CudaLog1pFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // log1p(x) = log(1 + x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log(one + x)); } @@ -714,7 +714,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + x) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (one + x); } @@ -726,7 +726,7 @@ struct CudaLog2Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log2(x) = log2(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log2(x)); } @@ -738,7 +738,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { T log_two = static_cast(log(static_cast(2.0f))); // dx = dout / (x * log(2)) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (x * log_two); } @@ -750,7 +750,7 @@ struct CudaLog10Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log10(x) = log10(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(log10(x)); } @@ -762,7 +762,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { T log_ten = static_cast(log(static_cast(10.0f))); // dx = dout / (x * log(10)) - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return dout / (x * log_ten); } @@ -779,7 +779,7 @@ struct CudaBReluFunctor : public BaseActivationFunctor { } // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t_min_cast = static_cast(t_min); T t_max_cast = static_cast(t_max); T temp_max = x > t_min_cast ? x : t_min_cast; @@ -799,7 +799,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { } // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T t_min_cast = static_cast(t_min); T t_max_cast = static_cast(t_max); return (x > t_min_cast && x < t_max_cast) ? dout : zero; @@ -820,7 +820,7 @@ struct CudaSoftReluFunctor : public BaseActivationFunctor { // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) // threshold should not be negative - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType t = static_cast(threshold); MPType temp_min = x < t ? x : t; @@ -841,8 +841,8 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 // threshold should not be negative - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_out) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_out) const { MPType dout = static_cast(arg_dout); MPType out = static_cast(arg_out); MPType t = static_cast(threshold); @@ -864,7 +864,7 @@ struct CudaSTanhFunctor : public BaseActivationFunctor { } // stanh(x) = b * tanh(a * x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType a = static_cast(scale_a); MPType b = static_cast(scale_b); @@ -884,8 +884,8 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { } // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType a = static_cast(scale_a); @@ -909,7 +909,7 @@ struct CudaSoftplusFunctor : public BaseActivationFunctor { } // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType b = static_cast(beta); MPType t = static_cast(threshold); @@ -930,8 +930,8 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { } // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType b = static_cast(beta); @@ -948,7 +948,7 @@ struct CudaSoftsignFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // softsign(x) = x / (1 + abs(x)) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x / (one + abs(x)); } }; @@ -958,7 +958,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + abs(x))^2 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T temp = one + abs(x); return dout / (temp * temp); } @@ -976,7 +976,7 @@ struct CudaRelu6Functor : public BaseActivationFunctor { } // relu6(x) = min(max(0, x), 6) - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t = static_cast(threshold); return x <= zero ? zero : (x < t ? x : t); } @@ -992,7 +992,7 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } // dx = (out > 0 && out < t) ? dout : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { T t = static_cast(threshold); return (out > zero && out < t) ? dout : zero; } @@ -1005,7 +1005,7 @@ struct CudaTanhShrinkFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tanhshrink(x) = x - tanh(x) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); return static_cast(x - tanh(x)); } @@ -1016,8 +1016,8 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * tanh(x)^2 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); return static_cast(dout * tanh(x) * tanh(x)); @@ -1036,7 +1036,7 @@ struct CudaHardShrinkFunctor : public BaseActivationFunctor { } // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t = static_cast(threshold); return (x > -t && x < t) ? zero : x; } @@ -1052,7 +1052,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { } // dx = (x > -threshold && x < threshold) ? 0 : dout - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T t = static_cast(threshold); return (x > -t && x < t) ? zero : dout; } @@ -1074,7 +1074,7 @@ struct CudaHardSigmoidFunctor : public BaseActivationFunctor { // hard_sigmoid(x) = 0, when x <= -3 // 1, when x >= 3 // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T temp = x * static_cast(slope) + static_cast(offset); T temp_max = temp > zero ? temp : zero; T temp_min = temp_max < one ? temp_max : one; @@ -1094,7 +1094,7 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { } // dx = (out > 0 && out < 1) ? dout * slope : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + __device__ __forceinline__ T operator()(const T dout, const T out) const { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } @@ -1112,7 +1112,7 @@ struct CudaSwishFunctor : public BaseActivationFunctor { } // swish(x) = x / (1 + exp(-beta * x)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType b = static_cast(beta); return static_cast(x / (one + exp(-b * x))); @@ -1130,8 +1130,8 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { } // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType b = static_cast(beta); @@ -1159,7 +1159,7 @@ struct CudaMishFunctor : public BaseActivationFunctor { // softplus(x) = x, if x > threshold // = ln(1 + exp(x)), otherwise // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); return static_cast(x * tanh(sp)); @@ -1180,8 +1180,8 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { // sp = softplus(x) // Inputs: args[0], the input dout // args[1], the input x - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); @@ -1204,7 +1204,7 @@ struct CudaThresholdedReluFunctor : public BaseActivationFunctor { } // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { return x > static_cast(threshold) ? x : zero; } }; @@ -1219,7 +1219,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { } // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { return x > static_cast(threshold) ? dout : zero; } @@ -1241,7 +1241,7 @@ struct CudaHardSwishFunctor : public BaseActivationFunctor { // x , when x >= threshold - offset // x * (x + offset) / scale, otherwise // threshold = scale = 6, offset = 3 by default - __device__ __forceinline__ T operator()(const T& x) const { + __device__ __forceinline__ T operator()(const T x) const { T t = static_cast(threshold); T temp = x + static_cast(offset); T temp_max = temp > zero ? temp : zero; @@ -1267,7 +1267,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { // dout , when x >= threshold - offset // dout * (2 * x / scale + offset / scale), otherwise // threshold = scale = 6, offset = 3 by default - __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + __device__ __forceinline__ T operator()(const T dout, const T x) const { T o = static_cast(offset); T s = static_cast(scale); T temp1 = static_cast(x + o > zero); @@ -1291,7 +1291,7 @@ struct CudaELUFunctor : public BaseActivationFunctor { // elu(x) = x, if x > 0 // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { CT x = static_cast(arg_x); CT temp = static_cast(alpha) * (exp(x) - one); CT res = x > zero ? x : temp; @@ -1312,8 +1312,7 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { // case 1: alpha >= 0 // dx = dout, if out > 0 // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_out) const { + __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { MPType dout = static_cast(arg_dout); MPType out = static_cast(arg_out); MPType a = static_cast(alpha); @@ -1338,8 +1337,8 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { // case 2: alpha < 0 // dx = dout, if x > 0 // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T& arg_dout, const T& arg_out, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType out = static_cast(arg_out); MPType x = static_cast(arg_x); @@ -1393,7 +1392,7 @@ struct CudaCELUFunctor : public BaseActivationFunctor { } // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)) - __device__ __forceinline__ T operator()(const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_x) const { CT x = static_cast(arg_x); CT temp = static_cast(alpha) * (exp(x / static_cast(alpha)) - one); CT res = (x > zero ? x : zero) + (temp > zero ? zero : temp); @@ -1416,8 +1415,8 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 // dx = dout , if alpha < 0 and x > 0 // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 - __device__ __forceinline__ T operator()(const T& arg_dout, - const T& arg_x) const { + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { MPType dout = static_cast(arg_dout); MPType x = static_cast(arg_x); MPType a = static_cast(alpha);