diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 68820443a2d5a68c73c0d5ebb855519fddbbf3d2..49e517dc9b3f3271ef26dfbece46f799ef805c57 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 0ee171c73c6600b95b9b093ef7e818855f53002d..b86865e2d126fbfc0b00495a6e3208932ac6de39 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index eadb00b9e88e14075c46a53c711fd43774f26581..28e1145db42123b9dacfa9e359e08476d16ab4c0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,7 +31,7 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); @@ -47,7 +47,7 @@ USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); -USE_OP(sigmoid_grad); +USE_OP_ITSELF(sigmoid_grad); USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc 
b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f..1ad82df41737c4093d0b5518c754ed85c505b8be 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace paddle USE_OP_ITSELF(relu); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index c835cf8ea148064648352bb5c6fbd533b02acda0..845d0ed073b32cc136ec6b9d76c9e3073d7b051a 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1492,6 +1492,10 @@ REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); +REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, + LogSigmoidGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1526,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, ops::SigmoidTripleGradFunctor::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -// Register Sigmoid/GradSigmoid Kernels -REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, - SigmoidGradFunctor); - -// Register DoubleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* 
========================================================================== */ /* ========================== tanh register ============================= */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4f197b95b21742e4af0889aa230f58821bf542ba..f1984af6e15eac6682bd341f470727b899e82f3a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,15 +238,6 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ @@ -285,160 +276,15 @@ USE_PHI_FUNCTOR(TanhShrink) USE_PHI_FUNCTOR(Silu) USE_PHI_FUNCTOR(ELU) USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) template using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; -template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = 
framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - (static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", 
"SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / 
(1 + exp(-x)) -// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // exp(x) = e^x template struct ExpFunctor : public BaseActivationFunctor { @@ -1101,43 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1365,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } -template -class SigmoidDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const 
framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - // set output ddout - ddOut = ctx.Output("DDOut"); - // extract dOut(intput) - dOut = ctx.Input("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -// Out, DDX, DOut, D_DDOut, D_DOut_New // input -// D_OutNew, D_DOut, D_DDx // output -template -class SigmoidTripleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - 
PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - -template -class TanhDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - - // extract ddx(input) and out(input) - auto ddx_var = ctx.InputVar("DDX"); - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable ddx, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable out, variable name = %s", - ctx.InputName("Out"))); - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - - // set output ddout - auto ddout_var = ctx.OutputVar("DDOut"); - if 
(ddout_var) { - ddOut = ctx.Output("DDOut"); - } - - // extract dOut(intput) - auto dout_var = ctx.InputVar("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dout_var, platform::errors::NotFound( - "Cannot get input Variable dout_var, variable name = %s", - ctx.InputName("DOut"))); - dOut = ctx.Input("DOut"); - - // set output dout_new - auto dout_new_var = ctx.OutputVar("DOutNew"); - if (dout_new_var) { - dOutNew = ctx.Output("DOutNew"); - } - - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -template -class TanhTripeGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - 
"Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - template class SquareDoubleGradKernel : public framework::OpKernel { @@ -1952,7 +1556,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace paddle #define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ @@ -1965,8 +1568,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 865943696c35aee75f5b8e48326b3d61c1e58532..7c1b288080162e2a5bf847a795fc640ab5e5e4e1 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -20,69 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? 
zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -551,49 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaHardSigmoidFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // hard_sigmoid(x) = 0, when x <= -3 - // 1, when x >= 3 - // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T x) const { - T temp = x * static_cast(slope) + static_cast(offset); - T temp_max = temp > zero ? temp : zero; - T temp_min = temp_max < one ? temp_max : one; - return temp_min; - } -}; - -template -struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // dx = (out > 0 && out < 1) ? dout * slope : 0 - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return (out > zero && out < one) ? 
dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -876,6 +770,9 @@ USE_PHI_FUNCTOR(CudaSoftShrink) USE_PHI_FUNCTOR(CudaTanhShrink) USE_PHI_FUNCTOR(CudaSilu) USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) template using CudaELUGradNegativeAlphaFunctor = @@ -954,35 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1120,8 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1141,8 +1007,6 @@ REGISTER_OP_CUDA_KERNEL( 
CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index e0dfca756e14782b1f97618ef87290464834a0e7..241a80d85ead2d7bb6cd63105feb345c62a29a62 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -19,14 +19,14 @@ limitations under the License. */ namespace phi { -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -34,7 +34,7 @@ namespace phi { float attr, \ DenseTensor* dx); -#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -43,19 +43,28 @@ namespace phi { float attr2, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const 
DenseTensor& dout, \ - float attr, \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ DenseTensor* dx); template @@ -107,28 +116,51 @@ void EluDoubleGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* ddout); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu); - -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); - -DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold) - - DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max) +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const 
DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); + +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 0762ce43ff8f06bd5cc7deaf62bc3cda7d6eb81c..dbc63a636edb188e4640fdd02895868034f1dd80 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -54,6 +54,8 @@ DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) DECLARE_ACTIVATION_KERNEL(TanhShrink) DECLARE_ACTIVATION_KERNEL(Silu) +DECLARE_ACTIVATION_KERNEL(Sigmoid) +DECLARE_ACTIVATION_KERNEL(LogSigmoid) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) @@ -62,5 +64,5 @@ 
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) - +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) } // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 11b396a84d0dee9172f0e5e70f9761fc2869fc89..c582261596221f4db8bd03599386082cee909096 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -90,6 +90,23 @@ namespace phi { dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); @@ -103,9 +120,11 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, LeakyReluGradFunctor, @@ 
-125,6 +144,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + HardSigmoidGradFunctor, + slope, + offset); + template void EluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -204,3 +228,8 @@ PD_REGISTER_KERNEL(tanh_triple_grad, float, double, phi::dtype::float16) {} +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 59ce18a11cc5ea13f3964faddad622e3c9344efd..1d7b77ea4445f494105d4c23516f31f349847089 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -72,6 +72,8 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, @@ -82,6 +84,10 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + HardSigmoidFunctor, + slope, + offset) } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} @@ -109,3 +115,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, 
SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 663258fa560b21a86c881a8bd0446eb8e77804bb..6c5ffbd06e3a435d9568a6c4717d8ce83b5aec00 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1012,6 +1012,217 @@ struct SiluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut -> SigmoidGradGrad -> DOutNew + DDX DDOut + + DDOut = (1-Out)*Out*DDX + DOutNew = (1-2*Out)*DOut*DDX +*/ +template +struct SigmoidGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); + + if (dOutNew) { + auto dout = 
EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); + dout_new.device(*d) = + (static_cast(1) - static_cast(2) * out) * dout * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); + ddout.device(*d) = (static_cast(1) - out) * out * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + 
static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0))) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() /
((-temp).exp() + (-x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -1653,6 +1864,112 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(one / (one + exp(-x))); + } +}; + +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out * (one - out); + } + + static constexpr ActBwdOpFwdDeps 
FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x * slope + offset <= 0 + // 1, when x * slope + offset >= 1 + // x * slope + offset, otherwise + __device__ __forceinline__ T operator()(const T x) const { + T temp = x * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ?
temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return (out > zero && out < one) ? dout * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index b12fc6975b37d79ac9d49284b34b746d24c53681..c912d0c4686ff3fee88925f4d7121f38f24a5485 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor); + DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor); 
@@ -157,6 +176,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, CudaLeakyReluGradFunctor, @@ -176,6 +196,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + CudaHardSigmoidGradFunctor, + slope, + offset); + template void EluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -270,3 +295,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index cd9330ead84295769244485365f0a0f06d44082e..6b598c764debb059072ba3ae3ac90e6985479133 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -91,6 +91,8 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, 
CudaSigmoidFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, @@ -103,6 +105,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + CudaHardSigmoidFunctor, + slope, + offset) } // namespace phi @@ -155,3 +161,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index a95f49c0e7cfd32802f1d1899a1fe1590fdf6a87..7d6b6dc72ea60214ff4c9974b4ff885feecb5822 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -222,4 +222,57 @@ void EluDoubleGradKernel(const Context& dev_ctx, functor(dev_ctx, &x, &ddx, ddout, &dout, dx); } +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::SigmoidGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, 
+ const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_out_new->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_ddx->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::SigmoidTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, + d_dout, + d_out_new, + d_ddx); +} + } // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 890dbadf17c81fa40f629114df47f518fdcc387b..7ae0dc45c5e1be09a31821c171b84fbb47fe1c9e 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -56,9 +56,14 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, + "hard_sigmoid", + "slope" comma "offset"); // NOLINT KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { @@ -79,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping( {"D_OutNew", "D_DOut", "D_DDx"}); } +KernelSignature SigmoidDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "sigmoid_double_grad", {"Out", "DDX",
"DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature SigmoidTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + KernelSignature LeakyReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -114,6 +133,7 @@ PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -152,3 +172,12 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, + phi::SigmoidDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, + phi::SigmoidTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, + phi::LogSigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, + phi::HardSigmoidGradOpArgumentMapping);