From 12bf0502d8d0d598fe77f93d9af2992411a63863 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 14 Sep 2021 20:15:44 +0800 Subject: [PATCH] Implement FunctionTraits to support two kinds of elementwise functor and remove some old codes for broadcast. (#35688) --- paddle/fluid/operators/abs_op.cu | 8 +- paddle/fluid/operators/activation_op.cu | 490 +++++++----------- .../fluid/operators/controlflow/bitwise_op.cu | 80 ++- .../operators/controlflow/compare_all_op.cu | 25 +- .../fluid/operators/controlflow/compare_op.cu | 51 +- .../elementwise/elementwise_add_op.cu | 18 +- .../elementwise/elementwise_mul_op.cu | 9 +- .../elementwise/elementwise_op_broadcast.cu.h | 176 +++---- .../elementwise/elementwise_op_function.h | 175 ++----- .../elementwise/elementwise_op_impl.cu.h | 132 +++-- .../elementwise/elementwise_sub_op.cu | 9 +- .../fluid/operators/fused/attn_bias_add.cu.h | 8 +- .../kernel_primitives/compute_primitives.h | 112 ++-- paddle/fluid/operators/lgamma_op.cu | 17 +- paddle/fluid/operators/matrix_rank_op.cu | 15 +- paddle/fluid/operators/svd_helper.h | 20 +- paddle/fluid/platform/function_traits.h | 51 ++ 17 files changed, 601 insertions(+), 795 deletions(-) create mode 100644 paddle/fluid/platform/function_traits.h diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index b0eba229fde..94b0a3ae729 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -24,15 +24,15 @@ struct CudaAbsFunctor; template struct CudaAbsFunctor>> { - __device__ __forceinline__ math::Real operator()(const T* args) const { - return abs(args[0]); + __device__ __forceinline__ math::Real operator()(const T& x) const { + return abs(x); } }; template struct CudaAbsFunctor>> { - __device__ __forceinline__ T operator()(const T* args) const { - return std::abs(args[0]); + __device__ __forceinline__ T operator()(const T& x) const { + return std::abs(x); } }; diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 6c024504791..72f10bf19e7 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -24,9 +24,8 @@ struct CudaReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); // relu(x) = max(x, 0) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] > zero ? args[0] : zero; + __device__ __forceinline__ T operator()(const T& x) const { + return x > zero ? x : zero; } }; @@ -35,10 +34,8 @@ struct CudaReluGradFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); // dx = dout * (out > 0) - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - return args[1] > zero ? args[0] : zero; + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return out > zero ? dout : zero; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -54,9 +51,8 @@ struct CudaLeakyReluFunctor : public BaseActivationFunctor { } // leakyrelu(x) = x > 0 ? x : alpha * x - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] > zero ? args[0] : static_cast(alpha) * args[0]; + __device__ __forceinline__ T operator()(const T& x) const { + return x > zero ? x : static_cast(alpha) * x; } }; @@ -70,10 +66,8 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { } // dx = dout * (x > 0 ? 1 : alpha) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[1] > zero ? args[0] : static_cast(alpha) * args[0]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return x > zero ? dout : static_cast(alpha) * dout; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -85,9 +79,8 @@ struct CudaSigmoidFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // sigmoid(x) = 1 / (1 + exp(-x)) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(one / (one + exp(-x))); } }; @@ -97,10 +90,8 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout * out * (1 - out) - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] * args[1] * (one - args[1]); + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return dout * out * (one - out); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -108,14 +99,12 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { template struct CudaSiluFunctor : public BaseActivationFunctor { - // MPType means Compute Type using MPType = typename details::MPTypeTrait::Type; MPType one = static_cast(1.0f); // silu(x) = x / (1 + exp(-x)) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(x / (one + exp(-x))); } }; @@ -126,11 +115,10 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); MPType temp = one / (one + exp(-x)); return static_cast(dout * (temp * (one + x * (one - temp)))); } @@ -147,9 +135,8 @@ struct CudaLogSigmoidFunctor : public BaseActivationFunctor { // For numerical stability, // logsigmoid(x) = // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); MPType temp = x > zero ? zero : -x; return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); } @@ -164,11 +151,10 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { // For numerical stability: // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, // 0))) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); MPType temp1 = x > zero ? zero : -x; MPType temp2 = exp(-x - temp1); return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); @@ -182,9 +168,8 @@ struct CudaAtanFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // atan(x) = atan(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(atan(x)); } }; @@ -194,10 +179,8 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + x^2) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] / (one + args[1] * args[1]); + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return dout / (one + x * x); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -214,9 +197,7 @@ struct CudaSoftShrinkFunctor : public BaseActivationFunctor { // softshrink(x) = x - lambda, if x > lambda; // x + lambda, if x < -lambda; // 0, otherwise. - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[0]; + __device__ __forceinline__ T operator()(const T& x) const { T l = static_cast(lambda); T temp1 = static_cast(x > l); T temp2 = static_cast(x < -l); @@ -234,12 +215,9 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { } // dx = dout, if x > lambda or x < -lambda else 0 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { T l = static_cast(lambda); - return (x >= -l && x <= l) ? zero : args[0]; + return (x >= -l && x <= l) ? zero : dout; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -250,9 +228,8 @@ struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // ceil(x) = ceil(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(ceil(x)); } }; @@ -262,9 +239,8 @@ struct CudaFloorFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // floor(x) = floor(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(floor(x)); } }; @@ -274,17 +250,16 @@ struct CudaRoundFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // round(x) = round(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(round(x)); } }; -// grad functor for ceil, floor and round +// GradFunctor for ceil, floor and round template struct CudaZeroGradFunctor : public BaseActivationFunctor { - __device__ __forceinline__ T operator()(const T* args) const { + __device__ __forceinline__ T operator()(const T& x) const { return static_cast(0.0f); } @@ -296,9 +271,8 @@ struct CudaCosFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // cos(x) = cos(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(cos(x)); } }; @@ -308,11 +282,10 @@ struct CudaCosGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * (-sin(x)) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(-dout * sin(x)); } @@ -324,9 +297,8 @@ struct CudaSinFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sin(x) = sin(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(sin(x)); } }; @@ -336,11 +308,10 @@ struct CudaSinGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * cos(x) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(dout * cos(x)); } @@ -352,9 +323,8 @@ struct CudaTanFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tan(x) = tan(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(tan(x)); } }; @@ -364,11 +334,10 @@ struct CudaTanGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout / cos(x)^2 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(dout / (cos(x) * cos(x))); } @@ -380,9 +349,8 @@ struct CudaAsinFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // asin(x) = asin(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(asin(x)); } }; @@ -393,11 +361,10 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = dout / sqrt(1 - x^2) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(dout / sqrt(one - x * x)); } @@ -409,9 +376,8 @@ struct CudaAcosFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // acos(x) = acos(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(acos(x)); } }; @@ -422,11 +388,10 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // dx = -dout / sqrt(1 - x^2) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(-dout / sqrt(one - x * x)); } @@ -438,9 +403,8 @@ struct CudaCoshFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // cosh(x) = cosh(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(cosh(x)); } }; @@ -450,11 +414,10 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * sinh(x) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(dout * sinh(x)); } @@ -466,9 +429,8 @@ struct CudaSinhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sinh(x) = sinh(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(sinh(x)); } }; @@ -478,11 +440,10 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * cosh(x) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(dout * cosh(x)); } @@ -494,9 +455,8 @@ struct CudaTanhFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tanh(x) = tanh(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(tanh(x)); } }; @@ -506,11 +466,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout * (1 - out^2) - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - T dout = static_cast(args[0]); - T out = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { return dout * (one - out * out); } @@ -522,19 +478,14 @@ struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // reciprocal(x) = 1 / x - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return one / args[0]; - } + __device__ __forceinline__ T operator()(const T& x) const { return one / x; } }; template struct CudaReciprocalGradFunctor : public BaseActivationFunctor { // dx = -dout * out^2 - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - return -args[0] * args[1] * args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return -dout * out * out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -545,9 +496,8 @@ struct CudaExpFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // exp(x) = exp(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(exp(x)); } }; @@ -555,10 +505,8 @@ struct CudaExpFunctor : public BaseActivationFunctor { template struct CudaExpGradFunctor : public BaseActivationFunctor { // dx = dout * out - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] * args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return dout * out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -569,9 +517,8 @@ struct CudaExpm1Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // expm1(x) = expm1(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(expm1(x)); } }; @@ -579,10 +526,8 @@ struct CudaExpm1Functor : public BaseActivationFunctor { template struct CudaExpm1GradFunctor : public BaseActivationFunctor { // dx = dout * out - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] * args[1] + args[0]; + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return dout * out + dout; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -593,9 +538,8 @@ struct CudaLogFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log(x) = log(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(log(x)); } }; @@ -603,10 +547,8 @@ struct CudaLogFunctor : public BaseActivationFunctor { template struct CudaLogGradFunctor : public BaseActivationFunctor { // dx = dout / x - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] / args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return dout / x; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -615,10 +557,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { template struct CudaSquareFunctor : public BaseActivationFunctor { // square(x) = x * x - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] * args[0]; - } + __device__ __forceinline__ T operator()(const T& x) const { return x * x; } }; template @@ -626,10 +565,8 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { T two = static_cast(2.0f); // dx = dout * 2 * x - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] * two * args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return dout * two * x; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -640,9 +577,8 @@ struct CudaSqrtFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // sqrt(x) = sqrt(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(sqrt(x)); } }; @@ -652,10 +588,8 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { T one_half = static_cast(0.5f); // dx = dout * 0.5 / out - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - return one_half * args[0] / args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return one_half * dout / out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -666,9 +600,8 @@ struct CudaRsqrtFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // rsqrt(x) = rsqrt(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(rsqrt(x)); } }; @@ -677,12 +610,9 @@ template struct CudaRsqrtGradFunctor : public BaseActivationFunctor { T minus_one_half = static_cast(-0.5f); - // dx = dout * -0.5 / out^3 - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - T out = args[1]; - return minus_one_half * args[0] * out * out * out; + // dx = -0.5 * dout * out^3 + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return minus_one_half * dout * out * out * out; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -694,9 +624,8 @@ struct CudaLog1pFunctor : public BaseActivationFunctor { MPType one = static_cast(1.0f); // log1p(x) = log(1 + x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(log(one + x)); } }; @@ -706,10 +635,8 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + x) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] / (one + args[1]); + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return dout / (one + x); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -720,9 +647,8 @@ struct CudaLog2Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log2(x) = log2(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(log2(x)); } }; @@ -733,10 +659,8 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { T log_two = static_cast(log(static_cast(2.0f))); // dx = dout / (x * log(2)) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] / (args[1] * log_two); + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return dout / (x * log_two); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -747,9 +671,8 @@ struct CudaLog10Functor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // log10(x) = log10(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(log10(x)); } }; @@ -760,10 +683,8 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { T log_ten = static_cast(log(static_cast(10.0f))); // dx = dout / (x * log(10)) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] / (args[1] * log_ten); + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return dout / (x * log_ten); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -779,9 +700,7 @@ struct CudaBReluFunctor : public BaseActivationFunctor { } // brelu(x) = min(max(x, t_min), t_max) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[0]; + __device__ __forceinline__ T operator()(const T& x) const { T t_min_cast = static_cast(t_min); T t_max_cast = static_cast(t_max); T temp_max = x > t_min_cast ? x : t_min_cast; @@ -801,11 +720,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { } // dx = (x > t_min && x < t_max) ? dout : 0 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T dout = args[0]; - T x = args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { T t_min_cast = static_cast(t_min); T t_max_cast = static_cast(t_max); return (x > t_min_cast && x < t_max_cast) ? dout : zero; @@ -825,10 +740,9 @@ struct CudaSoftReluFunctor : public BaseActivationFunctor { } // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) - // Inputs: args[0], the input x // threshold should not be negative - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); MPType t = static_cast(threshold); MPType temp_min = x < t ? x : t; MPType temp_max = temp_min > -t ? temp_min : -t; @@ -847,12 +761,11 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { } // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 - // Inputs: args[0], the input dout - // args[1], the input out // threshold should not be negative - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType out = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_out) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); MPType t = static_cast(threshold); return (out > -t && out < t) ? static_cast(dout * (one - exp(-out))) : static_cast(0.0f); @@ -872,9 +785,8 @@ struct CudaSTanhFunctor : public BaseActivationFunctor { } // stanh(x) = b * tanh(a * x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); MPType a = static_cast(scale_a); MPType b = static_cast(scale_b); return static_cast(b * tanh(a * x)); @@ -893,11 +805,10 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { } // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); MPType a = static_cast(scale_a); MPType b = static_cast(scale_b); MPType temp = tanh(a * x); @@ -919,9 +830,8 @@ struct CudaSoftplusFunctor : public BaseActivationFunctor { } // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); MPType b = static_cast(beta); MPType t = static_cast(threshold); MPType x_beta = x * beta; @@ -941,15 +851,14 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { } // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); MPType b = static_cast(beta); MPType t = static_cast(threshold); MPType x_beta = x * beta; - return x_beta > t ? args[0] : static_cast(dout / (one + exp(-x_beta))); + return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -960,9 +869,8 @@ struct CudaSoftsignFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // softsign(x) = x / (1 + abs(x)) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] / (one + abs(args[0])); + __device__ __forceinline__ T operator()(const T& x) const { + return x / (one + abs(x)); } }; @@ -971,11 +879,9 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); // dx = dout / (1 + abs(x))^2 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T temp = one + abs(args[1]); - return args[0] / (temp * temp); + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + T temp = one + abs(x); + return dout / (temp * temp); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -991,10 +897,9 @@ struct CudaRelu6Functor : public BaseActivationFunctor { } // relu6(x) = min(max(0, x), 6) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { + __device__ __forceinline__ T operator()(const T& x) const { T t = static_cast(threshold); - return args[0] <= zero ? zero : (args[0] < t ? args[0] : t); + return x <= zero ? zero : (x < t ? x : t); } }; @@ -1008,11 +913,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } // dx = (out > 0 && out < t) ? dout : 0 - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { T t = static_cast(threshold); - return (args[1] > zero && args[1] < t) ? args[0] : zero; + return (out > zero && out < t) ? dout : zero; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -1023,9 +926,8 @@ struct CudaTanhShrinkFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // tanhshrink(x) = x - tanh(x) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); return static_cast(x - tanh(x)); } }; @@ -1035,11 +937,10 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; // dx = dout * tanh(x)^2 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); return static_cast(dout * tanh(x) * tanh(x)); } @@ -1056,9 +957,7 @@ struct CudaHardShrinkFunctor : public BaseActivationFunctor { } // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[0]; + __device__ __forceinline__ T operator()(const T& x) const { T t = static_cast(threshold); return (x > -t && x < t) ? zero : x; } @@ -1074,12 +973,9 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { } // dx = (x > -threshold && x < threshold) ? 0 : dout - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { T t = static_cast(threshold); - return (x > -t && x < t) ? zero : args[0]; + return (x > -t && x < t) ? zero : dout; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1099,9 +995,8 @@ struct CudaHardSigmoidFunctor : public BaseActivationFunctor { // hard_sigmoid(x) = 0, when x <= -3 // 1, when x >= 3 // x * slope + offset, otherwise - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T temp = args[0] * static_cast(slope) + static_cast(offset); + __device__ __forceinline__ T operator()(const T& x) const { + T temp = x * static_cast(slope) + static_cast(offset); T temp_max = temp > zero ? temp : zero; T temp_min = temp_max < one ? temp_max : one; return temp_min; @@ -1120,11 +1015,8 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { } // dx = (out > 0 && out < 1) ? dout * slope : 0 - // Inputs: args[0], the input dout - // args[1], the input out - __device__ __forceinline__ T operator()(const T* args) const { - T out = args[1]; - return (out > zero && out < one) ? args[0] * static_cast(slope) : zero; + __device__ __forceinline__ T operator()(const T& dout, const T& out) const { + return (out > zero && out < one) ? dout * static_cast(slope) : zero; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } @@ -1141,9 +1033,8 @@ struct CudaSwishFunctor : public BaseActivationFunctor { } // swish(x) = x / (1 + exp(-beta * x)) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); MPType b = static_cast(beta); return static_cast(x / (one + exp(-b * x))); } @@ -1160,11 +1051,10 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { } // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); MPType b = static_cast(beta); MPType temp1 = one / (one + exp(-b * x)); MPType out = x * temp1; @@ -1186,9 +1076,8 @@ struct CudaThresholdedReluFunctor : public BaseActivationFunctor { } // thresholded_relu(x) = x > threshold ? x : 0 - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[0] > static_cast(threshold) ? args[0] : zero; + __device__ __forceinline__ T operator()(const T& x) const { + return x > static_cast(threshold) ? x : zero; } }; @@ -1202,10 +1091,8 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { } // dx = x > threshold ? dout : 0 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - return args[1] > static_cast(threshold) ? args[0] : zero; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { + return x > static_cast(threshold) ? dout : zero; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1226,9 +1113,7 @@ struct CudaHardSwishFunctor : public BaseActivationFunctor { // x , when x >= threshold - offset // x * (x + offset) / scale, otherwise // threshold = scale = 6, offset = 3 by default - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[0]; + __device__ __forceinline__ T operator()(const T& x) const { T t = static_cast(threshold); T temp = x + static_cast(offset); T temp_max = temp > zero ? temp : zero; @@ -1254,15 +1139,12 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { // dout , when x >= threshold - offset // dout * (2 * x / scale + offset / scale), otherwise // threshold = scale = 6, offset = 3 by default - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - T x = args[1]; + __device__ __forceinline__ T operator()(const T& dout, const T& x) const { T o = static_cast(offset); T s = static_cast(scale); T temp1 = static_cast(x + o > zero); T temp2 = static_cast(x + o < static_cast(threshold)); - return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2); + return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1280,9 +1162,8 @@ struct CudaELUFunctor : public BaseActivationFunctor { } // elu(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T* args) const { - CT x = static_cast(args[0]); + __device__ __forceinline__ T operator()(const T& arg_x) const { + CT x = static_cast(arg_x); CT temp = static_cast(alpha) * (exp(x) - one); CT res = (x > zero ? x : zero) + (temp > zero ? zero : temp); return static_cast(res); @@ -1304,11 +1185,10 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0 // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0 // dx = 0, if alpha <= 0 and x <=0 - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T* args) const { - MPType dout = static_cast(args[0]); - MPType x = static_cast(args[1]); + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); MPType a = static_cast(alpha); MPType temp_a_pos = static_cast(alpha > 0.0f); MPType temp_a_neg = static_cast(alpha <= 0.0f); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu index b549f7e3300..2f4098c2608 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ b/paddle/fluid/operators/controlflow/bitwise_op.cu @@ -18,60 +18,46 @@ limitations under the License. */ namespace paddle { namespace operators { -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##CUDAFunctor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T* args) const { \ - return args[0] expr args[1]; \ - } \ - }; \ - \ - template <> \ - struct Bitwise##func##CUDAFunctor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool* args) const { \ - return args[0] bool_expr args[1]; \ - } \ - }; - -BITWISE_BINARY_FUNCTOR(And, &, &&) -BITWISE_BINARY_FUNCTOR(Or, |, ||) -BITWISE_BINARY_FUNCTOR(Xor, ^, !=) -#undef BITWISE_BINARY_FUNCTOR +template +class BinaryBitwiseOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using T = typename Functor::ELEM_TYPE; -template -struct BitwiseNotCUDAFunctor { - using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T* args) const { return ~args[0]; } -}; + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); -template <> -struct BitwiseNotCUDAFunctor { - using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool* args) const { return !args[0]; } + auto functor = Functor(); + std::vector ins = {x, y}; + std::vector outs = {out}; + const auto& cuda_ctx = + ctx.template device_context(); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, -1, functor); + } }; template -class BinaryBitwiseOpKernel +class UnaryBitwiseOpKernel : public framework::OpKernel { public: - using T = typename Functor::ELEM_TYPE; void Compute(const framework::ExecutionContext& ctx) const override { + using T = typename Functor::ELEM_TYPE; + + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto functor = Functor(); - std::vector ins; - std::vector outs; + std::vector ins = {x}; + std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, &outs, functor); } }; @@ -81,7 +67,7 @@ class BinaryBitwiseOpKernel namespace ops = ::paddle::operators; namespace plat = ::paddle::platform; -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndCUDAFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrCUDAFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorCUDAFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotCUDAFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorFunctor); +REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 9e22d74d6e2..8e8f3f01104 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -17,9 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; - namespace paddle { namespace operators { @@ -38,23 +35,6 @@ struct BitwiseAdd { } }; -template -struct CudaEqualReduceFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T args[]) const { - return (args[0] == args[1]); - } -}; - -template -struct CudaEqualReduceFunctor< - T, typename std::enable_if::value>::type> { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T args[]) const { - return fabs(static_cast(args[0] - args[1])) < 1e-8; - } -}; - template class CompareReduceOpKernel : public framework::OpKernel { @@ -97,6 +77,9 @@ class CompareReduceOpKernel } // namespace operators } // namespace paddle +namespace ops = paddle::operators; +namespace plat = paddle::platform; + #define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ REGISTER_OP_CUDA_KERNEL( \ op_type, \ @@ -109,5 +92,5 @@ class CompareReduceOpKernel ops::CompareReduceOpKernel>); -REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, CudaEqualReduceFunctor) +REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, EqualReduceFunctor) #undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index bf7861a03d8..fc7dce208c4 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -21,46 +21,11 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -#define DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(func, op) \ - template \ - struct func { \ - using ELEMENT_TYPE = T; \ - inline HOSTDEVICE bool operator()(const T* args) const { \ - return args[0] op args[1]; \ - } \ - }; - -DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessThanFunctor, <) -DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessEqualFunctor, <=) -DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterThanFunctor, >) -DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterEqualFunctor, >=) -DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaEqualFunctor, ==) -DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaNotEqualFunctor, !=) -#undef DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT - -template -struct CudaEqualFunctor< - T, typename std::enable_if::value>::type> { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T* args) const { - return fabs(static_cast(args[0] - args[1])) < 1e-8; - } -}; - -template -struct CudaNotEqualFunctor< - T, typename std::enable_if::value>::type> { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T* args) const { - return fabs(static_cast(args[0] - args[1])) > 1e-8; - } -}; - template class CompareOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: - using InT = typename Functor::ELEMENT_TYPE; + using InT = typename Functor::ELEM_TYPE; using OutT = bool; void Compute(const framework::ExecutionContext& ctx) const override { auto functor = Functor(); @@ -87,10 +52,10 @@ class CompareOpKernel ops::CompareOpKernel, void>, \ ops::CompareOpKernel, void>); -REGISTER_CUDA_COMPARE_KERNEL(equal, CudaEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(not_equal, CudaNotEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_than, CudaLessThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_equal, CudaLessEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_than, CudaGreaterThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_equal, CudaGreaterEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(equal, EqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(not_equal, NotEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_than, LessThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_equal, LessEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_than, GreaterThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_equal, GreaterEqualFunctor) #undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 69bcd6d0d06..bd91142882f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" @@ -24,21 +25,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -/* - input: an array; - return: the result of the math functor - 1. For Unary Op, the length of input array is 1, - e.g. Relu: return args[0] > 0 ? args[0] : 0; - 2. For Binary Op, the length of input array is 2, - e.g. Add: return args[0] expr args[1]; -*/ -template -struct CudaAddFunctor { - inline HOSTDEVICE T operator()(const T* args) const { - return args[0] + args[1]; - } -}; - template class ElementwiseAddKernel : public framework::OpKernel { @@ -51,7 +37,7 @@ class ElementwiseAddKernel int axis = PackTensorsIntoVector(ctx, &ins, &outs); LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, CudaAddFunctor()); + cuda_ctx, ins, &outs, axis, AddFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index bf34db09861..33b6f1d60b8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -24,13 +24,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -template -struct CudaMulFunctor { - inline HOSTDEVICE T operator()(const T* args) const { - return args[0] * args[1]; - } -}; - template class ElementwiseMulKernel : public framework::OpKernel { @@ -44,7 +37,7 @@ class ElementwiseMulKernel int axis = PackTensorsIntoVector(ctx, &ins, &outs, &x_for_selectedrows); LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, CudaMulFunctor()); + cuda_ctx, ins, &outs, axis, MulFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 129c90a22be..53ac85802a6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,11 +16,10 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" + namespace paddle { namespace operators { -#define MAX_INPUT_NUM 3 // the max num of ET for BroadcacstConfig - namespace kps = paddle::operators::kernel_primitives; struct DimensionsTransform { @@ -46,10 +45,9 @@ struct DimensionsTransform { axis++; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The %dth dimension of input tensor is expected to be equal " - "with" - "the %dth dimension of output tensor %d or 1, but recieved " - "%d.\n", + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", in_idx + 1, axis + 1, out_dims[axis], in_dim[in_idx])); } } while (in_idx < in_dim.size()); @@ -61,10 +59,9 @@ struct DimensionsTransform { in_idx++; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The %dth dimension of input tensor is expected to be equal " - "with" - "the %dth dimension of output tensor %d or 1, but recieved " - "%d.\n", + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", in_idx + 1, in_idx + 1, out_dims[in_idx], in_dim[in_idx])); } } while (in_idx < dim_size); @@ -165,79 +162,71 @@ struct DimensionsTransform { } }; -template +template __device__ __forceinline__ void LoadData( T *dst, const T *__restrict__ src, uint32_t block_offset, - const kps::details::BroadcastConfig &config, int numel, int num, + const kps::details::BroadcastConfig &config, int numel, int num, bool need_broadcast) { // numel : whole num of output // num: how many data will be deal with in this time if (need_broadcast) { - kps::ReadDataBc( - dst, src, block_offset, config, numel, 1, 1); + kps::ReadDataBc(dst, src, block_offset, + config, numel, 1, 1); } else { kps::ReadData(dst, src + block_offset, num); } } -template +template __device__ void DealSegment( - const framework::Array &in, OutT *out, - const framework::Array &use_broadcast, uint32_t numel, - const framework::Array, - MAX_INPUT_NUM> &configlists, + const framework::Array &ins, OutT *out, + const framework::Array &use_broadcast, uint32_t numel, + const framework::Array, Arity> &configs, int num, Functor func) { - InT args[ET][VecSize]; + InT args[Arity][VecSize]; OutT result[VecSize]; + int block_offset = blockIdx.x * blockDim.x * VecSize; -// load + #pragma unroll - for (int i = 0; i < ET; i++) { + for (int i = 0; i < Arity; i++) { kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], in[i], block_offset, - configlists[i], numel, num, - use_broadcast[i]); + LoadData(args[i], ins[i], block_offset, + configs[i], numel, num, + use_broadcast[i]); } - // compute - if (ET == kUnary) { - kps::ElementwiseUnary(result, args[0], - func); - } else if (ET == kBinary) { - kps::ElementwiseBinary(result, args[0], - args[1], func); - } else { - kps::ElementwiseTernary( - result, args[0], args[1], args[2], func); - } - // compute + + const bool kCallElementwiseAny = + platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller()(func, args, result); kps::WriteData(out + block_offset, result, num); } -template +template __global__ void BroadcastKernel( - framework::Array in, OutT *out, - framework::Array use_broadcast, uint32_t numel, - framework::Array, MAX_INPUT_NUM> - configlists, + framework::Array ins, OutT *out, + framework::Array use_broadcast, uint32_t numel, + framework::Array, Arity> configs, int main_tid, int tail_tid, Functor func) { int block_offset = blockIdx.x * blockDim.x * VecSize; // data offset of this block if (blockIdx.x < main_tid) { int num = blockDim.x * VecSize; // blockIdx.x < main_tid - DealSegment( - in, out, use_broadcast, numel, configlists, num, func); + DealSegment( + ins, out, use_broadcast, numel, configs, num, func); } else { // reminder int num = tail_tid; - DealSegment( - in, out, use_broadcast, numel, configlists, num, func); + DealSegment( + ins, out, use_broadcast, numel, configs, num, func); } } -template +template void LaunchKernel(const platform::CUDADeviceContext &ctx, const std::vector &ins, framework::Tensor *out, Functor func, @@ -251,53 +240,58 @@ void LaunchKernel(const platform::CUDADeviceContext &ctx, auto stream = ctx.stream(); OutT *out_data = out->data(); - framework::Array, MAX_INPUT_NUM> - configlists; - framework::Array use_broadcast; - framework::Array ins_data; + framework::Array, Arity> configs; + framework::Array use_broadcast; + framework::Array ins_data; - for (int i = 0; i < ET; i++) { + for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); ins_data[i] = ins[i]->data(); if (use_broadcast[i]) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configlists[i] = kps::details::BroadcastConfig( + configs[i] = kps::details::BroadcastConfig( merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); } } - BroadcastKernel<<>>( - ins_data, out_data, use_broadcast, numel, configlists, main_tid, tail_tid, + BroadcastKernel<<>>( + ins_data, out_data, use_broadcast, numel, configs, main_tid, tail_tid, func); } -template -void LaunchBroadcastKernelForDifferentDimSize( +template +void LaunchBroadcastKernelForDifferentVecSize( const platform::CUDADeviceContext &ctx, const std::vector &ins, framework::Tensor *out, int axis, Functor func) { const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); -#define DIM_SIZE(size) \ - case size: { \ - LaunchKernel(ctx, ins, out, func, \ - merge_dims); \ + +#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ + case rank: { \ + LaunchKernel(ctx, ins, out, \ + func, merge_dims); \ } break; switch (merge_dims.dim_size) { - DIM_SIZE(1); - DIM_SIZE(2); - DIM_SIZE(3); - DIM_SIZE(4); - DIM_SIZE(5); - DIM_SIZE(6); - DIM_SIZE(7); - DIM_SIZE(8); + CALL_BROADCAST_FOR_DIM_SIZE(1); + CALL_BROADCAST_FOR_DIM_SIZE(2); + CALL_BROADCAST_FOR_DIM_SIZE(3); + CALL_BROADCAST_FOR_DIM_SIZE(4); + CALL_BROADCAST_FOR_DIM_SIZE(5); + CALL_BROADCAST_FOR_DIM_SIZE(6); + CALL_BROADCAST_FOR_DIM_SIZE(7); + CALL_BROADCAST_FOR_DIM_SIZE(8); + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.\n", + merge_dims.dim_size, framework::DDim::kMaxRank)); + } } -#undef DIM_SIZE +#undef CALL_BROADCAST_FOR_DIM_SIZE } template @@ -305,11 +299,21 @@ void LaunchBroadcastElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, int axis, Functor func) { - PADDLE_ENFORCE_EQ(ET, ElementwiseType::kBinary, + using Traits = platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), kArity, platform::errors::InvalidArgument( - "Currently, only Support binary calculation, " - "but received %d input tensors.\n", - static_cast(ET))); + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), kArity)); + PADDLE_ENFORCE_EQ(kArity, 2, + platform::errors::InvalidArgument( + "Currently only broadcast of binary is supported and " + "verified, but received %d.", + kArity)); + int in_vec_size = 4; framework::Tensor *out = (*outs)[0]; for (auto *in : ins) { @@ -322,18 +326,18 @@ void LaunchBroadcastElementwiseCudaKernel( switch (vec_size) { case 4: { - LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, - axis, func); + LaunchBroadcastKernelForDifferentVecSize( + ctx, ins, out, axis, func); break; } case 2: { - LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, - axis, func); + LaunchBroadcastKernelForDifferentVecSize( + ctx, ins, out, axis, func); break; } case 1: { - LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, - axis, func); + LaunchBroadcastKernelForDifferentVecSize( + ctx, ins, out, axis, func); break; } default: { @@ -369,7 +373,5 @@ void LaunchElementwiseCudaKernel( } } -#undef MAX_INPUT_NUM - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index dd8e3d409c0..7bbfefba20f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -37,8 +37,10 @@ limitations under the License. */ #endif #include +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" + #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; #else @@ -278,128 +280,6 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) -template -__global__ void ElementwiseKernel(const T *__restrict__ x_data, - const T *__restrict__ y_data, - OutType *__restrict__ out_data, int n, - int post, const size_t total, Functor func) { - int tid = threadIdx.x + blockDim.x * blockIdx.x; - int stride = blockDim.x * gridDim.x; - - for (int i = tid; i < total; i += stride) { - int idx = i / post % n; - out_data[i] = func(x_data[i], y_data[idx]); - } -} - -template -void ComputeElementwiseCUDA(const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - int pre, int n, int post, - const platform::CUDADeviceContext &ctx, - Functor func, const bool is_xsize_larger = true) { - const T *x_data = x->data(); - const T *y_data = y->data(); - OutType *out_data = z->mutable_data(ctx.GetPlace()); - - int numel = pre * n * post; - int threads = 256; - int blocks = (numel + threads - 1) / threads; - - if (is_xsize_larger) { - ElementwiseKernel<<>>( - x_data, y_data, out_data, n, post, numel, func); - - } else { - ElementwiseKernel<<>>( - y_data, x_data, out_data, n, post, numel, func); - } -} - -template -__global__ void CommonForwardBroadcastCUDAKernel( - const int *x_strides_array, const int *y_strides_array, - const int *out_dims_array, const T *x, const T *y, OutType *out, - int out_size, int max_dim, Functor func, const bool is_xsize_larger) { - for (int out_index = blockIdx.x * blockDim.x + threadIdx.x; - out_index < out_size; out_index += blockDim.x * gridDim.x) { - int x_index = 0; - int y_index = 0; - int out_index_quotient = out_index; - int remainder = 0; -#pragma unroll - for (int i = max_dim - 1; i >= 0; --i) { - GetDivMod(out_index_quotient, out_dims_array[i], &out_index_quotient, - &remainder); - x_index += remainder * x_strides_array[i]; - y_index += remainder * y_strides_array[i]; - } - if (is_xsize_larger) { - out[out_index] = func(x[x_index], y[y_index]); - } else { - out[out_index] = func(y[y_index], x[x_index]); - } - } -} - -template -void CommonForwardBroadcastCUDA( - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z, int *x_dims_array, int *y_dims_array, - int *out_dims_array, int max_dim, const platform::CUDADeviceContext &ctx, - Functor func, const bool is_xsize_larger = true) { - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - auto cplace = platform::CPUPlace(); - const T *x_data = x->data(); - const T *y_data = y->data(); - OutType *out_data = z->mutable_data(ctx.GetPlace()); - - std::vector x_strides_array(max_dim); - std::vector y_strides_array(max_dim); - int x_stride = 1; - int y_stride = 1; - for (int i = max_dim - 1; i >= 0; i--) { - x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; - y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; - x_stride *= x_dims_array[i]; - y_stride *= y_dims_array[i]; - } - - int bytes = max_dim * sizeof(int); - auto x_strides_array_tmp = memory::Alloc(ctx, bytes); - int *x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), - bytes, ctx.stream()); - - auto y_strides_array_tmp = memory::Alloc(ctx, bytes); - int *y_strides_array_gpu = - reinterpret_cast(y_strides_array_tmp->ptr()); - memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), - bytes, ctx.stream()); - - auto out_dims_array_tmp = memory::Alloc(ctx, bytes); - int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); - memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, - ctx.stream()); - - const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, - 1, std::multiplies()); - dim3 gird_size = dim3( - (out_size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - - CommonForwardBroadcastCUDAKernel< - Functor, T, OutType><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, x_data, - y_data, out_data, out_size, max_dim, func, is_xsize_larger); -} - -#endif // __NVCC__ or __HIPCC__ - template void CommonGradBroadcastCPU( const framework::Tensor &x, const framework::Tensor &y, @@ -1917,21 +1797,10 @@ void CommonElementwiseBroadcastForward( y_dims_array.data(), out_dims_array.data(), max_dim, axis); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - CommonForwardBroadcastCUDA( - x, y, z, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), func, - is_xsize_larger); -#endif - } else { - CommonForwardBroadcastCPU( - x, y, z, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), func, - is_xsize_larger); - } + CommonForwardBroadcastCPU( + x, y, z, x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), + max_dim, ctx.template device_context(), func, + is_xsize_larger); } template @@ -1975,12 +1844,35 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, } } +// It is a common implementation to compute binary calculation with the support +// of broadcast, supporting both CPU and GPU. +// - CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like paddle/fluid/operators/elementwise/elementwise_add_op.h#L49 - L55. +// - GPU implementation supports all the broadcast cases, thus there is no need +// to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. template void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { + if (platform::is_gpu_place(ctx.GetPlace())) { +#if defined(__NVCC__) || defined(__HIPCC__) + std::vector ins = {x, y}; + std::vector outs = {z}; + z->mutable_data(ctx.GetPlace()); + + const auto &dev_ctx = + ctx.template device_context(); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, axis, func); +#endif + return; + } + auto x_dims = x->dims(); auto y_dims = y->dims(); bool is_xsize_larger = true; @@ -2029,15 +1921,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, return; } - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - ComputeElementwiseCUDA( - x, y, z, pre, n, post, - ctx.template device_context(), func, - is_xsize_larger); -#endif - return; - } if (post == 1) { functor.RunRowWise(n, pre); return; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index e591b145d23..83aff3b5577 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #pragma once #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/function_traits.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -28,7 +29,8 @@ namespace paddle { namespace operators { namespace kps = paddle::operators::kernel_primitives; -enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 }; + +enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; /* * According to NVIDIA, if number of threads per block is 64/128/256/512, @@ -55,8 +57,9 @@ inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx, } template -int GetVectorizedSizeForIO(const std::vector &ins, - const std::vector &outs) { +int GetVectorizedSizeForTensors( + const std::vector &ins, + const std::vector &outs) { int vec_size = 4; for (auto iter = ins.begin(); iter != ins.end(); ++iter) { vec_size = std::min(vec_size, @@ -69,56 +72,88 @@ int GetVectorizedSizeForIO(const std::vector &ins, return vec_size; } -template -__device__ void DealSegment( - const framework::Array &in, OutT *out, int num, - Functor func) { - int data_offset = VecSize * blockIdx.x * blockDim.x; - InT args[ET][VecSize]; - OutT result[VecSize]; -// load data -#pragma unroll - for (int i = 0; i < ET; i++) { - kps::Init(args[i], static_cast(1.0f)); - kps::ReadData(args[i], in[i] + data_offset, - num); +template +struct ElementwisePrimitiveCaller { + __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + OutT *result); +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseAny(result, args, + func); } +}; - // compute - if (ET == kUnary) { +template +struct ElementwisePrimitiveCaller { + __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + OutT *result) { kps::ElementwiseUnary(result, args[0], func); - } else if (ET == kBinary) { + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline OutT operator()(Functor func, InT (*args)[VecSize], + OutT *result) { kps::ElementwiseBinary(result, args[0], args[1], func); - } else { + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline OutT operator()(Functor func, InT **args, OutT *result) { kps::ElementwiseTernary( result, args[0], args[1], args[2], func); } +}; + +template +__device__ void DealSegment( + const framework::Array &in, OutT *out, + int num, Functor func) { + InT args[Arity][VecSize]; + OutT result[VecSize]; - // store + int data_offset = VecSize * blockIdx.x * blockDim.x; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + kps::ReadData(args[i], in[i] + data_offset, + num); + } + + const bool kCallElementwiseAny = + platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller()(func, args, result); kps::WriteData(out + data_offset, result, num); } -template +template __global__ void ElementVectorizeKernel( - framework::Array in, OutT *out, int size, + framework::Array ins, OutT *out, int size, Functor func) { int data_offset = VecSize * blockIdx.x * blockDim.x; int num = size - data_offset; // the num this time have to deal with if (VecSize * blockDim.x > num) { // reminder segment - DealSegment(in, out, num, func); + DealSegment(ins, out, num, func); } else { // complete segment - DealSegment(in, out, num, func); + DealSegment(ins, out, num, func); } } -template +template void ElementwiseCudaKernel(const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, @@ -129,14 +164,14 @@ void ElementwiseCudaKernel(const platform::CUDADeviceContext &ctx, ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; auto stream = ctx.stream(); - OutT *out = (*outs)[0]->data(); - framework::Array in; - for (int i = 0; i < ET; i++) { - in[i] = ins[i]->data(); + OutT *out_data = (*outs)[0]->data(); + framework::Array ins_data; + for (int i = 0; i < Arity; i++) { + ins_data[i] = ins[i]->data(); } - ElementVectorizeKernel<<>>( - in, out, numel, func); + ElementVectorizeKernel<<>>( + ins_data, out_data, numel, func); } template @@ -144,17 +179,30 @@ void LaunchSameDimsElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, Functor func) { + using Traits = platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), kArity, + platform::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), kArity)); + // calculate the max vec_size for all ins and outs - int vec_size = GetVectorizedSizeForIO(ins, *outs); + int vec_size = GetVectorizedSizeForTensors(ins, *outs); switch (vec_size) { case 4: - ElementwiseCudaKernel(ctx, ins, outs, func); + ElementwiseCudaKernel(ctx, ins, outs, + func); break; case 2: - ElementwiseCudaKernel(ctx, ins, outs, func); + ElementwiseCudaKernel(ctx, ins, outs, + func); break; case 1: - ElementwiseCudaKernel(ctx, ins, outs, func); + ElementwiseCudaKernel(ctx, ins, outs, + func); break; default: { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index da9610243f7..2643cc0e7a2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -22,13 +22,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -template -struct CudaSubFunctor { - inline HOSTDEVICE T operator()(const T* args) const { - return args[0] - args[1]; - } -}; - template class ElementwiseSubKernel : public framework::OpKernel { @@ -41,7 +34,7 @@ class ElementwiseSubKernel int axis = PackTensorsIntoVector(ctx, &ins, &outs); LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, CudaSubFunctor()); + cuda_ctx, ins, &outs, axis, SubFunctor()); } }; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 37e7bd9caa6..a8bd35a1b73 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -52,10 +52,8 @@ template using ReduceParamType = typename CudnnDataType::BatchNormParamType; template -struct CudaAddFunctor { - inline HOSTDEVICE T operator()(const T* args) const { - return args[0] + args[1]; - } +struct AddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } }; template out_dims = {n, m}; configlists[1] = kps::details::BroadcastConfig<2>(out_dims, input1_dims, 2); - auto func = CudaAddFunctor(); + auto func = AddFunctor(); auto stream = ctx.stream(); switch (vec_size) { case 4: { diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 58642ef2631..2898a11fd7a 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -21,7 +21,6 @@ #include #endif -// #include #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" @@ -135,53 +134,114 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } // namespace details -/*************************** Compute Function****************************/ +/** + * @brief unary function + * @param + * T: data type of in + * OutT: data type of out + * NX: the cols of in + * NY: the rows of in + * BlockSize: the config of this device + * OpFunc: compute functor which have an operator() as following + * template + * struct XxxFunctor { + * HOSTDEVICE OutT operator()(const T& a) const { + * return ...; + * } + * }; + */ +template +__device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, + OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX * NY; idx++) { + out[idx] = static_cast(compute(in[idx])); + } +} /** * @brief binary function, in1 and in2 have same shape - * @param: + * @param * T: data type of in1, in2 * OutT: data type of out * NX: the cols of in1, in2 * NY: the rows of in1, in2 * BlockSize: the config of this device - * OpFunc: compute functor eg: in1 + in2, in1 - in2 + * OpFunc: compute functor which have an operator() as following + * template + * struct XxxFunctor { + * HOSTDEVICE OutT operator()(const T& a, const T& b) const { + * return ...; + * } + * }; */ template __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1, const T* in2, OpFunc compute) { - T args[2]; #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { - args[0] = in1[idx]; - args[1] = in2[idx]; - out[idx] = static_cast(compute(args)); + out[idx] = static_cast(compute(in1[idx], in2[idx])); } } /** * @brief ternary function, in1, in2 and in3 have same shape - * @param: + * @param * T: data type of in1, in2, in3 * OutT: data type of out * NX: the cols of in1, in2 * NY: the rows of in1, in2 * BlockSize: the config of this device - * OpFunc: compute functor eg: out = in1 * in2 + in3 + * OpFunc: compute functor which have an operator() as following + * template + * struct XxxFunctor { + * HOSTDEVICE OutT operator()(const T& a, const T& b, const T& c) const { + * return ...; + * } + * }; */ template __device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1, const T* in2, const T* in3, OpFunc compute) { - T args[3]; #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { - args[0] = in1[idx]; - args[1] = in2[idx]; - args[2] = in3[idx]; + out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); + } +} + +/** + * @brief a general function for elementwise computation, all inputs have + * the same shape. + * @param + * T: data type of in1, in2, in3 + * OutT: data type of out + * NX: the cols of in1, in2 + * NY: the rows of in1, in2 + * BlockSize: the config of this device + * OpFunc: compute functor which have an operator() as following + * template + * struct XxxFunctor { + * HOSTDEVICE OutT operator()(const T* args) const { + * return ...; + * } + * }; + */ +template +__device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY], + OpFunc compute) { + T args[Arity]; +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { +#pragma unroll + for (int j = 0; j < Arity; ++j) { + args[j] = ins[j][idx]; + } out[idx] = static_cast(compute(args)); } } @@ -189,7 +249,7 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1, /** * @brief cycle binary function, in1's shape size is [1, NX], in2's shape size * is [NY, NX], out's shape size is [NY, NX] - * @param: + * @param * T: data type of in1, in2 * OutT: data type of out * NX: the cols of in1, in2 @@ -211,26 +271,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, } } -/** - * @brief unary function - * @param: - * T: data type of in - * OutT: data type of out - * NX: the cols of in - * NY: the rows of in - * BlockSize: the config of this device - * OpFunc: compute functor eg: relu, exp - */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, - OpFunc compute) { -#pragma unroll - for (int idx = 0; idx < NX * NY; idx++) { - out[idx] = static_cast(compute(in + idx)); - } -} - /** * @brief reduce function, in's shape size is [NX, NY]. * If ReduceMode == kLocalMode then reduce NX, the shape of out is [NY, 1], @@ -238,7 +278,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, * shape of out is [NY, NX]. If reduce_last_dim is false and reduce_num was * split, BlockYReduce will be called. If reduce_last_dim is true and * reduce_num was split, BlockXReduce will be called - * @typename: + * @typename * T: data type of in * NX: the cols of in * NY: the rows of in diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu index befd31e3bd8..baf86c99b56 100644 --- a/paddle/fluid/operators/lgamma_op.cu +++ b/paddle/fluid/operators/lgamma_op.cu @@ -15,18 +15,14 @@ #include #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/lgamma_op.h" -#include "paddle/fluid/operators/math/complex_functors.h" namespace paddle { namespace operators { -template -struct CudaLgammaFunctor; - template -struct CudaLgammaFunctor>> { - __device__ __forceinline__ T operator()(const T* args) const { - return Eigen::numext::lgamma(args[0]); +struct CudaLgammaFunctor { + __device__ __forceinline__ T operator()(const T& x) const { + return Eigen::numext::lgamma(x); } }; @@ -37,15 +33,14 @@ class LgammaKernel void Compute(const framework::ExecutionContext& context) const override { const Tensor* x = context.Input("X"); Tensor* out = context.Output("Out"); - out->mutable_data>(context.GetPlace()); + out->mutable_data(context.GetPlace()); auto& dev_ctx = context.device_context(); std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaLgammaFunctor(); - LaunchSameDimsElementwiseCudaKernel>(dev_ctx, ins, &outs, - functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index c6f85abac97..d85a262b5e9 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -129,17 +129,10 @@ class MatrixRankGPUKernel : public framework::OpKernel { compare_result.mutable_data(detail::NewAxisDim(dim_out, k), context.GetPlace()); int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, platform::CUDADeviceContext, - T, int64_t>(context, &eigenvalue_tensor, &tol_tensor, - axis, GreaterThanFunctor(), - &compare_result); - } else { - ElementwiseComputeEx, platform::CUDADeviceContext, T, - int64_t>(context, &eigenvalue_tensor, &tol_tensor, - axis, LessThanFunctor(), - &compare_result); - } + ElementwiseComputeEx, platform::CUDADeviceContext, T, + int64_t>(context, &eigenvalue_tensor, &tol_tensor, + axis, GreaterThanFunctor(), + &compare_result); auto dito_int = math::DeviceIndependenceTensorOperations(context); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bdf402397dd..f266aa0cba0 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include @@ -296,14 +297,23 @@ struct DeviceIndependenceTensorOperations { framework::Tensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(framework::make_ddim(out_shape)); - if (x.dims().size() >= y.dims().size()) { + if (platform::is_gpu_place(context.GetPlace())) { +#if defined(__NVCC__) || defined(__HIPCC__) + // For GPU, there is no need to define XxxInverseFunctor and call + // ElementwiseComputeEx in two branches. ElementwiseComputeEx, DeviceContext, T>( context, &x, &y, -1, SubFunctor(), &ret); +#endif } else { - ElementwiseComputeEx, DeviceContext, T>( - // This is copyed from elementwise_sub, which means we - // need reverse will xrank < yrank - context, &x, &y, -1, InverseSubFunctor(), &ret); + if (x.dims().size() >= y.dims().size()) { + ElementwiseComputeEx, DeviceContext, T>( + context, &x, &y, -1, SubFunctor(), &ret); + } else { + ElementwiseComputeEx, DeviceContext, T>( + // This is copyed from elementwise_sub, which means we + // need reverse will xrank < yrank + context, &x, &y, -1, InverseSubFunctor(), &ret); + } } return ret; } diff --git a/paddle/fluid/platform/function_traits.h b/paddle/fluid/platform/function_traits.h new file mode 100644 index 00000000000..e1041184e64 --- /dev/null +++ b/paddle/fluid/platform/function_traits.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +// Declare a template class with a single template parameter. +template +struct FunctionTraits; + +// A forwarding trait allowing functors (objects which have an operator()) +// to be used with this traits class. +template +struct FunctionTraits : public FunctionTraits {}; + +// A partial specialization of FunctionTraits for pointers to member functions +// and has const/non-const class member functions. +template +struct FunctionTraits + : public FunctionTraits {}; +template +struct FunctionTraits + : public FunctionTraits {}; + +// An implementation for common function. +template +struct FunctionTraits { + static const size_t arity = sizeof...(Args); + static const bool has_pointer_args = + (arity == 1) && + (std::is_pointer< + typename std::tuple_element<0, std::tuple>::type>::value); +}; + +} // namespace platform +} // namespace paddle -- GitLab