elu: Optimize gradient calculation; add more comments

a815d6ab · zhouxiao-coder · a2657fea · a815d6ab · a815d6ab · a815d6ab
4 changed files
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -174,6 +174,25 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  }
 };

+// Proto/checker maker for the ELU operator. It declares one input "X", one
+// output "Y" and a scalar attribute "alpha" of type AttrType that defaults
+// to 1.
+template <typename AttrType>
+class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "Input of ELU operator, it shouldn't be empty. Input is flattened "
+             "and treated as a 1D array.");
+    AddOutput("Y", "Output of ELU operator, has same shape as the input.");
+    // Adjacent string literals are concatenated as-is, so a fragment that
+    // ends a sentence must carry its own trailing space.
+    AddComment(
+        "ELU activation operator. It applies this element-wise computation on "
+        "the input: f(x) = max(0, x) + min(0, alpha * (exp(x) - 1)). "
+        "Check .. _Link: https://arxiv.org/abs/1511.07289 for more details");
+    AddAttr<AttrType>("alpha",
+                      "alpha value in the elu formulation, default to 1.")
+        .SetDefault(static_cast<AttrType>(1.));
+  }
+};
+
 template <typename AttrType>
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
@@ -311,6 +330,12 @@ REGISTER_OP_CPU_KERNEL(soft_relu,
 REGISTER_OP_CPU_KERNEL(
    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::CPUPlace, float>);

+// Register the elu operator (op class ActivationOp, proto maker
+// ELUOpMaker<float>) together with its gradient op elu_grad
+// (ActivationOpGrad), followed by the float CPUPlace kernels for both.
+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker<float>, elu_grad,
+            ops::ActivationOpGrad);
+REGISTER_OP_CPU_KERNEL(elu, ops::ELUKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(elu_grad,
+                       ops::ELUGradKernel<paddle::platform::CPUPlace, float>);
+
 REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
            ops::ActivationOpGrad);
 REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel<paddle::platform::CPUPlace, float>);

--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -97,6 +97,10 @@ REGISTER_OP_GPU_KERNEL(soft_relu,
 REGISTER_OP_GPU_KERNEL(
    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::GPUPlace, float>);

+// Register the float GPUPlace kernels for elu and elu_grad.
+REGISTER_OP_GPU_KERNEL(elu, ops::ELUKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(elu_grad,
+                       ops::ELUGradKernel<paddle::platform::GPUPlace, float>);
+
 REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(pow_grad,
                       ops::PowGradKernel<paddle::platform::GPUPlace, float>);

--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -296,6 +296,46 @@ class SoftReluGradKernel : public framework::OpKernel<T> {
  }
 };

+// Forward ELU kernel. Reads input "X" and attribute "alpha" and writes
+//   Y = max(0, X) + min(0, alpha * (exp(X) - 1))
+// element-wise over the flattened tensors.
+template <typename Place, typename T, typename AttrType = T>
+class ELUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const T zero = static_cast<T>(0);
+    const T one = static_cast<T>(1);
+
+    auto* in_tensor = context.Input<framework::Tensor>("X");
+    auto* out_tensor = context.Output<framework::Tensor>("Y");
+    const T alpha = static_cast<T>(context.Attr<AttrType>("alpha"));
+    // Allocate the output buffer before an Eigen view is taken over it.
+    out_tensor->mutable_data<T>(context.GetPlace());
+
+    auto in = framework::EigenVector<T>::Flatten(*in_tensor);
+    auto out = framework::EigenVector<T>::Flatten(*out_tensor);
+    auto device = context.GetEigenDevice<Place>();
+    // Positive part passes through; negative part is the scaled exponential.
+    out.device(device) =
+        in.cwiseMax(zero) + (alpha * (in.exp() - one)).cwiseMin(zero);
+  }
+};
+
+// Backward ELU kernel. Computes dX from the forward input X, the forward
+// output Y and the upstream gradient dY:
+//   x >  0 : dX = dY
+//   x <= 0 : dX = dY * (Y + alpha)
+// With alpha >= 0 the forward output for x <= 0 is alpha * (exp(x) - 1), so
+// Y + alpha equals alpha * exp(x), i.e. the derivative, and exp() does not
+// have to be evaluated a second time.
+// NOTE(review): the Y + alpha shortcut presumes alpha >= 0 — confirm that
+// negative alpha is not expected by callers.
+template <typename Place, typename T, typename AttrType = T>
+class ELUGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* Y = context.Input<framework::Tensor>("Y");
+    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto alpha = static_cast<T>(context.Attr<AttrType>("alpha"));
+    // Allocate the gradient buffer before an Eigen view is taken over it.
+    dX->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto y = framework::EigenVector<T>::Flatten(*Y);
+    auto dy = framework::EigenVector<T>::Flatten(*dY);
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto place = context.GetEigenDevice<Place>();
+    // The two masks must cover every x. Using `<=` on the negative branch
+    // keeps x == 0 from falling through both masks and yielding a zero
+    // gradient; at x == 0, y is 0, so the result there is dy * alpha.
+    dx.device(place) =
+        dy * (x > static_cast<T>(0)).template cast<T>() +
+        dy * (y + alpha) * (x <= static_cast<T>(0)).template cast<T>();
+  }
+};
+
 template <typename Place, typename T, typename AttrType = T>
 class PowKernel : public framework::OpKernel<T> {
 public:

--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -144,6 +144,26 @@ class TestSoftRelu(OpTest):
        self.check_grad(['X'], 'Y', max_relative_error=0.02)


+class TestELU(OpTest):
+    """Checks the elu op's output and its gradient w.r.t. X against NumPy."""
+
+    def setUp(self):
+        self.op_type = "elu"
+        alpha = 1.
+        data = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        # Standard ELU (alpha = 1) is differentiable at 0, unlike the other
+        # Relu variants, so samples near 0 are kept as drawn instead of being
+        # nudged away with something like data[np.abs(data) < 0.005] = 0.02.
+        positive_part = np.maximum(0, data)
+        negative_part = np.minimum(0, alpha * (np.exp(data) - 1))
+        self.inputs = {'X': data}
+        self.attrs = {'alpha': alpha}
+        self.outputs = {'Y': positive_part + negative_part}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+
+
 class TestReciprocal(OpTest):
    def setUp(self):
        self.op_type = "reciprocal"