diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 9978fc3c41b878229bda62994df618e00f524337..cf21e26cd87be4689a19627214692f9dbb1face1 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -764,6 +764,31 @@ class LeakyReluDoubleGradMaker
   }
 };
 
+// elu grad: dx=dy if y>0 else alpha*dy*x.exp()
+// elu gradgrad: ddx=ddy if y>0 else alpha*ddy*x.exp()
+template <typename T>
+class ELUDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
+ public:
+  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  std::unique_ptr<T> Apply() const override {
+    auto* op = new T();
+    op->SetType("elu_grad_grad");
+
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
+    // X@GRAD@GRAD: ddx
+    op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+    op->SetAttrMap(this->Attrs());
+
+    // Out@GRAD@GRAD: ddy
+    op->SetOutput("DX", this->InputGrad("X"));
+    op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
+    return std::unique_ptr<T>(op);
+  }
+};
+
 // sqrt Grad: dx = 0.5 * dy / y
 // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx
 template <typename T>
@@ -984,6 +1009,34 @@ REGISTER_OP_CPU_KERNEL(
         plat::CPUDeviceContext,
         ops::LeakyReluGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
+/* ========================    elu  register     ============================ */
+REGISTER_OPERATOR(
+    elu, ops::ActivationOp, ops::ELUOpMaker, ops::ActivationOpInferVarType,
+    ops::ActivationGradOpMaker<ops::ELUGradFunctor<float>::FwdDeps(),
+                               paddle::framework::OpDesc>,
+    ops::ActivationGradOpMaker<ops::ELUGradFunctor<float>::FwdDeps(),
+                               paddle::imperative::OpBase>,
+    ops::ActFwdInplaceInferer);
+REGISTER_OPERATOR(elu_grad, ops::ActivationOpGrad,
+                  ops::ActivationGradOpInplaceInference,
+                  ops::ELUDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::ELUDoubleGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(
+    elu_grad_grad,
+    ops::ActivationOpDoubleGrad<ops::ELUGradFunctor<float>::FwdDeps()>,
+    ops::ActivationDoubleGradOpInplaceInference);
+
+REGISTER_ACTIVATION_CPU_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor);
+REGISTER_OP_CPU_KERNEL(
+    elu_grad_grad, ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
+                                            ops::ELUGradGradFunctor<float>>,
+    ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
+                             ops::ELUGradGradFunctor<double>>,
+    ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
+                             ops::ELUGradGradFunctor<plat::float16>>);
+
+/* ========================================================================== */
+
 /* ===========================   sqrt register  ============================= */
 REGISTER_OPERATOR(
     sqrt, ops::ActivationOp, ops::SqrtOpMaker, ops::ActivationOpInferVarType,
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 863e960c6a828c51db692285b45fae6150c4cf31..48ec90471f0becf921e7e68eb8722544885aaa7a 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -47,6 +47,18 @@ REGISTER_OP_CUDA_KERNEL(
         plat::CUDADeviceContext,
         ops::LeakyReluGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
+/* ========================    elu  register     ============================ */
+REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor);
+
+REGISTER_OP_CUDA_KERNEL(
+    elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
+                                            ops::ELUGradGradFunctor<float>>,
+    ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
+                             ops::ELUGradGradFunctor<double>>,
+    ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
+                             ops::ELUGradGradFunctor<plat::float16>>);
+/* ========================================================================== */
+
 /* ===========================   relu register  ============================= */
 REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor);
 
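Note on the registration above: elu keeps its forward/backward kernels through REGISTER_ACTIVATION_CPU_KERNEL / REGISTER_ACTIVATION_CUDA_KERNEL, but it now registers its own operators so that elu_grad can carry ELUDoubleGradMaker and elu_grad_grad gets dedicated kernels (the activation_op.h hunk below removes elu from the generic kernel macro accordingly). The math being wired up is d(elu)/dx = 1 for x > 0 and alpha * exp(x) otherwise, so the second derivative is alpha * exp(x) on the non-positive side and 0 elsewhere. A minimal NumPy sketch of the forward pass and first gradient, matching the "elu grad" comment above ELUDoubleGradMaker (helper names here are illustrative, not part of the patch):

import numpy as np

def elu(x, alpha=1.0):
    # elu(x) = x for x > 0, alpha * (exp(x) - 1) otherwise
    return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

def elu_grad(x, dy, alpha=1.0):
    # dx = dy for x > 0, alpha * dy * exp(x) otherwise
    return np.where(x > 0, dy, alpha * dy * np.exp(x))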
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index bc01943060da9a7252bc2289f3856900a0b881b3..fa6ec23ce83a21bff67d449ae8d4f2b1f2989ec1 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -1084,7 +1084,7 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
                    dout * static_cast<T>(alpha) * x.exp() *
-                       (x < static_cast<T>(0)).template cast<T>();
+                       (x <= static_cast<T>(0)).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -1405,6 +1405,39 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
+template <typename T>
+struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* X,
+                  const framework::Tensor* ddX, framework::Tensor* ddOut,
+                  const framework::Tensor* dOut, framework::Tensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
+    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
+
+    if (dX) {
+      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
+      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
+      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
+                      (x < static_cast<T>(0)).template cast<T>();
+    }
+
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
+      ddout.device(*d) = ddx *
+                         ((x > static_cast<T>(0)).template cast<T>() +
+                          static_cast<T>(alpha) * x.exp() *
+                              (x <= static_cast<T>(0)).template cast<T>())
+                             .template cast<T>();
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
 template <typename T>
 struct SqrtGradGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device>
@@ -1515,6 +1548,33 @@ class SquareDoubleGradKernel
   }
 };
 
+template <typename DeviceContext, typename Functor>
+class ELUDoubleGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::Tensor *X, *ddX, *dOut;
+    X = ddX = dOut = nullptr;
+    framework::Tensor *dX, *ddOut;
+    dX = ddOut = nullptr;
+
+    ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut);
+
+    if (dX) dX->mutable_data<T>(X->dims(), ctx.GetPlace());
+    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
+
+    auto& place = ctx.template device_context<DeviceContext>();
+
+    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(place, X, ddX, ddOut, dOut, dX);
+  }
+};
+
 template <typename DeviceContext, typename Functor>
 class SqrtDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -1688,7 +1748,6 @@ class PowGradKernel
   __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
   __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
   __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
-  __macro(elu, ELU, ELUFunctor, ELUGradFunctor);                              \
   __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
   __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
           HardSigmoidGradFunctor);                                            \
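For readers decoding the Eigen expressions in ELUGradGradFunctor, here is a rough NumPy transcription of the two branches (a sketch for intuition only, not part of the patch; x, ddx and dout mirror the functor's X, DDX and DOut tensors):

import numpy as np

def elu_grad_grad(x, ddx, dout, alpha=1.0):
    # DX: second-order term flowing back to the forward input; only the
    # x < 0 branch of elu_grad actually depends on x, so only it contributes.
    dx = ddx * dout * alpha * np.exp(x) * (x < 0)
    # DDOut: ddx pushed through the first derivative of elu,
    # i.e. 1 for x > 0 and alpha * exp(x) for x <= 0.
    ddout = ddx * ((x > 0) + alpha * np.exp(x) * (x <= 0))
    return dx, ddout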
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index 733643287cea27edf7c39be6179d7c259d287034..3e6d6e5ea859a98a95081b524ab5259f9b4d1e8b 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -75,6 +75,30 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestELUDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 3, 7, 9]
+        eps = 0.0001
+        alpha = 1.1
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+
+        y = layers.elu(x, alpha=alpha)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 class TestSqrtDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
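The new test relies on gradient_checker.double_grad_check, which numerically verifies the second-order gradient against finite differences. As a quick standalone sanity check of the same identity outside the Paddle test harness (an assumed sketch, not part of the test file), the analytic second derivative of elu can be probed with a central difference:

import numpy as np

def elu(x, alpha=1.1):
    return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))

def elu_second_derivative(x, alpha=1.1):
    # Analytic d2(elu)/dx2: 0 for x > 0, alpha * exp(x) otherwise.
    return np.where(x > 0, 0.0, alpha * np.exp(x))

x = np.random.uniform(-1.0, 1.0, size=1000)
x = x[np.abs(x) > 1e-3]  # keep samples away from the kink at x = 0
eps = 1e-4
numeric = (elu(x + eps) - 2.0 * elu(x) + elu(x - eps)) / eps**2
assert np.allclose(numeric, elu_second_derivative(x), atol=1e-4)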