diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 055909ba6f486ff82220c2d36c54687091bde9ed..47618114a85ff1cd4f9455793ca4c63afe260558 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
   }
 };
 
+template <typename T>
+class SigmoidDoubleGradMaker
+    : public ::paddle::framework::SingleGradOpMaker<T> {
+ public:
+  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("sigmoid_grad_grad");
+    // input1: Out
+    op->SetInput("Out", this->Input("Out"));
+    // input2: ddx
+    op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+    op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
+    op->SetAttrMap(this->Attrs());
+    // output: ddy
+    op->SetOutput("DOutNew", this->InputGrad("Out"));
+    op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
+  }
+};
+
 template <typename T>
 class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
  public:
@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform;
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
 
+/* ========================== sigmoid register =============================
+ */
+// 1. Register Sigmoid Operator
+REGISTER_OPERATOR(
+    sigmoid, ops::ActivationOp, ops::SigmoidOpMaker,
+    ops::ActivationOpInferVarType,
+    ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
+                               paddle::framework::OpDesc>,
+    ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
+                               paddle::imperative::OpBase>,
+    std::conditional<ops::CanInplaceAct<ops::SigmoidGradFunctor<float>>(),
+                     ops::ActFwdInplaceInferer, void>::type);
+
+// 2. Register Sigmoid Grad Operator
+REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad,
+                  ops::ActivationGradOpInplaceInferer,
+                  ops::SigmoidDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::SigmoidDoubleGradMaker<paddle::imperative::OpBase>)
+
+// 3. Register Sigmoid DoubleGrad Operator
+REGISTER_OPERATOR(
+    sigmoid_grad_grad,
+    ops::ActivationOpDoubleGrad<ops::SigmoidGradGradFunctor<float>::FwdDeps()>,
+    ops::ActivationDoubleGradOpInplaceInferer);
+
+// Register Sigmoid/GradSigmoid Kernels
+REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
+                               SigmoidGradFunctor);
+
+// Register DoubleGrad Kernel
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_grad_grad,
+    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
+                                 ops::SigmoidGradGradFunctor<float>>,
+    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
+                                 ops::SigmoidGradGradFunctor<double>>,
+    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
+                                 ops::SigmoidGradGradFunctor<plat::float16>>);
+
+/* ========================================================================== */
+
 /* ========================== tanh register ============================= */
 REGISTER_OPERATOR(
     tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType,
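A note on the wiring in SigmoidDoubleGradMaker above: DDX is the incoming gradient of sigmoid_grad's output dX, while the two outputs DOutNew and DDOut carry the gradients flowing back to sigmoid_grad's inputs Out and dOut. A minimal NumPy sketch of the same chain (hypothetical helper names for illustration, not Paddle API):

    import numpy as np

    def sigmoid_grad(out, dout):
        # first-order grad op: dX = dOut * Out * (1 - Out)
        return dout * out * (1.0 - out)

    def sigmoid_grad_grad(out, dout, ddx):
        # differentiate sigmoid_grad w.r.t. its two inputs, contracted with ddx
        ddout = (1.0 - out) * out * ddx            # gradient flowing to dOut
        dout_new = (1.0 - 2.0 * out) * dout * ddx  # gradient flowing to Out
        return dout_new, ddout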
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 87e65e8817798199c907f88692887a21da58673c..c94510c9dfe5235187565a4178216503fcef6275 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL(
 #endif
 /* ========================================================================== */
 
+/* =========================== sigmoid register ============================
+ */
+REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
+                                CudaSigmoidGradFunctor);
+
+REGISTER_OP_CUDA_KERNEL(
+    sigmoid_grad_grad,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<float>>,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<double>>,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<plat::float16>>);
+/* ========================================================================== */
+
 /* =========================== tanh register ============================ */
 REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
                                 CudaTanhGradFunctor);
@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                               \
-  __macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor);   \
   __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor);               \
   __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                   \
           CudaLogSigmoidGradFunctor);                                      \
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index ccd5bf528ba58ca731513a1a1fafce3f2f64c470..3bdf3f34721b039b1a64794c1b9932e1c7d8e834 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
+/*
+    Out
+    DOut -> SigmoidGradGrad -> DOutNew
+    DDX                        DDOut
+
+    DDOut = (1-Out)*Out*DDX
+    DOutNew = (1-2*Out)*DOut*DDX
+*/
+template <typename T>
+struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* Out,
+                  const framework::Tensor* ddX, const framework::Tensor* dOut,
+                  framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
+
+    if (dOutNew) {
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
+      auto dout_new = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
+      dout_new.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
+    }
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+
 // silu(x) = x / (1 + exp(-x))
 template <typename T>
 struct SiluFunctor : public BaseActivationFunctor<T> {
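The two identities in the comment block above can be sanity-checked numerically: treating dX = dOut * Out * (1 - Out) as a function of dOut and Out, central finite differences along the direction DDX should reproduce DDOut and DOutNew. A minimal NumPy sketch (illustration only, not part of the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    out = 1.0 / (1.0 + np.exp(-rng.uniform(-1, 1, 8)))  # Out = sigmoid(x)
    dout = rng.uniform(-1, 1, 8)
    ddx = rng.uniform(-1, 1, 8)
    eps = 1e-6

    def dX(dout_, out_):
        # forward of the first-order grad op
        return dout_ * out_ * (1.0 - out_)

    # directional derivative of dX w.r.t. dOut along ddx  ->  DDOut
    num_ddout = (dX(dout + eps * ddx, out) - dX(dout - eps * ddx, out)) / (2 * eps)
    # directional derivative of dX w.r.t. Out along ddx   ->  DOutNew
    num_dout_new = (dX(dout, out + eps * ddx) - dX(dout, out - eps * ddx)) / (2 * eps)

    assert np.allclose(num_ddout, (1 - out) * out * ddx)
    assert np.allclose(num_dout_new, (1 - 2 * out) * dout * ddx)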
@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }
 
+template <typename DeviceContext, typename Functor>
+class SigmoidDoubleGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::Tensor *Out, *ddX, *dOut;
+    framework::Tensor *dOutNew, *ddOut;
+    Out = ddX = dOut = nullptr;
+    dOutNew = ddOut = nullptr;
+
+    // extract ddx(input) and out(input)
+    ddX = ctx.Input<framework::Tensor>("DDX");
+    Out = ctx.Input<framework::Tensor>("Out");
+    PADDLE_ENFORCE_NOT_NULL(
+        ddX, platform::errors::NotFound(
+                 "Cannot get input Variable ddX, variable name = %s",
+                 ctx.InputName("DDX")));
+    PADDLE_ENFORCE_NOT_NULL(
+        Out, platform::errors::NotFound(
+                 "Cannot get input Variable Out, variable name = %s",
+                 ctx.InputName("Out")));
+
+    // set output ddout
+    ddOut = ctx.Output<framework::Tensor>("DDOut");
+
+    // extract dOut(input)
+    dOut = ctx.Input<framework::Tensor>("DOut");
+    PADDLE_ENFORCE_NOT_NULL(
+        dOut, platform::errors::NotFound(
+                  "Cannot get input Variable dOut, variable name = %s",
+                  ctx.InputName("DOut")));
+
+    // set output dout_new
+    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
+
+    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
+    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
+    auto& place = ctx.template device_context<DeviceContext>();
+    Functor functor;
+    functor(place, Out, ddX, dOut, dOutNew, ddOut);
+  }
+};
+
 template <typename DeviceContext, typename Functor>
 class TanhDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace paddle
 
 #define FOR_EACH_ACTIVATION_OP(__macro)                                      \
-  __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor);             \
   __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                         \
   __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
   __macro(atan, Atan, AtanFunctor, AtanGradFunctor);                         \
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index 6c35d445b43b7beec5d4d58d29adecffc9a0325c..81b3e9bf34887e07b7472aa516c1da90242002d8 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -26,6 +26,28 @@ import gradient_checker
 from decorator_helper import prog_scope
 
 
+class TestSigmoidDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 3, 7, 9]
+        eps = 0.0005
+        dtype = np.float64
+        x = layers.data('x', shape, False, dtype=dtype)
+        x.persistable = True
+        y = layers.sigmoid(x)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+        x_arr[np.abs(x_arr) < 0.005] = 0.002
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 class TestTanhDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
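For completeness, a hedged end-to-end sketch of how the new double grad is exercised from Python in dygraph mode (assuming a Paddle build that includes this patch; paddle.grad with create_graph=True is what re-differentiates the first-order gradient through sigmoid_grad_grad):

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.random.uniform(-1, 1, [2, 3]).astype('float64'))
    x.stop_gradient = False
    y = paddle.nn.functional.sigmoid(x)

    # first-order grad, kept in the graph so it can be differentiated again
    (dx,) = paddle.grad(y.sum(), x, create_graph=True)
    # second-order grad; this runs the sigmoid_grad_grad kernel added above
    (ddx,) = paddle.grad(dx.sum(), x)

    # analytic check: with y = sigmoid(x), y'' = y * (1 - y) * (1 - 2 * y)
    y_np = y.numpy()
    assert np.allclose(ddx.numpy(), y_np * (1 - y_np) * (1 - 2 * y_np))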