From 7d16fe87a30f1909cb9a8934f2a72e2064103e80 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Wed, 6 Sep 2017 16:05:29 +0800
Subject: [PATCH] refine softmax operator.

---
 paddle/operators/scale_op.cc                   | 18 ++++++++-----
 paddle/operators/softmax_op.cc                 | 27 ++++++++++---------
 paddle/operators/softmax_op.h                  | 16 +++++------
 .../v2/framework/tests/test_softmax_op.py      | 21 +++++++++------
 4 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 8e96a74c94..c2e005444b 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -44,11 +44,12 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 The equation is: Out = scale*X
 )DOC");
-    AddAttr("scale", "scale of scale operator.").SetDefault(1.0);
+    AddAttr("scale", "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
   }
 };
 
-// Identity Op's gradient is identity op, too.
+// IdentityOp's gradient is IdentityOp, too.
 // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
 template
 class ScaleGradOp : public NetOp {
@@ -65,17 +66,20 @@ class ScaleGradOp : public NetOp {
   }
 };
 
-// identity is a alias of scale op. This is also a example for creating a alias
-// operator.
+// IdentityOp is an alias of the ScaleOp. This is also an example for creating
+// an alias of an existing operator.
 template
 class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   IdentityOpMaker(framework::OpProto *proto,
                   framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "input tensor of identity op");
-    AddOutput("Out", "output tensor of identity op");
-    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
+    AddInput("X", "The input tensor of identity op.");
+    AddOutput("Out", "The output tensor of identity op.");
+    AddComment(R"DOC(
+The identity operator is just an alias of the scale operator with the
+attribute scale fixed to 1.0.
+)DOC");
   }
 };
 
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 7d062ad67c..f3aea6df7c 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -23,9 +23,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL,
+    PADDLE_ENFORCE(ctx.Input("logits")->dims().size() == 2UL,
                    "The input of softmax op must be a matrix.");
-    ctx.Output("Y")->Resize(ctx.Input("X")->dims());
+    ctx.Output("softmax")->Resize(ctx.Input("logits")->dims());
   }
 };
 
@@ -34,10 +34,10 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
   SoftmaxOpMaker(framework::OpProto *proto,
                  framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
+    AddInput("logits",
              "The input tensor of softmax. "
             "2-D with shape [batch_size, input_feature_dimensions].");
-    AddOutput("Y", "The normalized values with the same shape as X.");
+    AddOutput("softmax",
+              "The normalized values with the same shape as the input logits.");
     AddComment(R"DOC(
 The input of softmax operator is a 2-D tensor with shape N x K (N is the
 batch_size, K is the dimension of input feature).
 The output tensor has the same shape as the input tensor.
@@ -64,14 +64,17 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
-                            "Input(Y@GRAD) should not be null");
-    PADDLE_ENFORCE(ctx.Input("Y")->dims() ==
-                       ctx.Input(framework::GradVarName("Y"))->dims(),
-                   "the shape of Input(0) and Input(1) should be the same");
-    ctx.Output(framework::GradVarName("X"))
-        ->Resize(ctx.Input("Y")->dims());
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("softmax"),
+                            "Input(softmax) should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("softmax")),
+                            "Input(softmax@GRAD) should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx.Input("softmax")->dims(),
+        ctx.Input(framework::GradVarName("softmax"))->dims(),
+        "Input(softmax) and its gradient should have the same shape.");
+
+    ctx.Output(framework::GradVarName("logits"))
+        ->Resize(ctx.Input("logits")->dims());
   }
 };
 
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 4fa6b59540..57e5bfad02 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -28,12 +28,12 @@ template
 class SoftmaxKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto input = context.Input("X");
-    auto output = context.Output("Y");
-    output->mutable_data(context.GetPlace());
+    auto X = context.Input("logits");
+    auto Y = context.Output("softmax");
+    Y->mutable_data(context.GetPlace());
 
-    auto logits = EigenMatrix::From(*input);
-    auto softmax = EigenMatrix::From(*output);
+    auto logits = EigenMatrix::From(*X);
+    auto softmax = EigenMatrix::From(*Y);
 
     const int kBatchDim = 0;
     const int kClassDim = 1;
@@ -69,9 +69,9 @@ class SoftmaxGradKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& context) const override {
     std::shared_ptr scale_ = std::make_shared();
 
-    auto Y = context.Input("Y");
-    auto dY = context.Input(framework::GradVarName("Y"));
-    auto dX = context.Output(framework::GradVarName("X"));
+    auto Y = context.Input("softmax");
+    auto dY = context.Input(framework::GradVarName("softmax"));
+    auto dX = context.Output(framework::GradVarName("logits"));
     dX->mutable_data(context.GetPlace());
 
     const int batch_size = Y->dims()[0];
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py
index e670d93653..d7279df116 100644
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
@@ -18,18 +18,23 @@ class TestSoftmaxOp(unittest.TestCase):
     def setUp(self):
         self.type = "softmax"
-        self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+        self.inputs = {"logits": np.random.random((10, 10)).astype("float32")}
         self.outputs = {
-            'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+            "softmax":
+            np.apply_along_axis(stable_softmax, 1, self.inputs["logits"])
         }
 
 
-class SoftmaxGradOpTest(GradientChecker):
-    def test_softmax(self):
-        op = create_op("softmax")
-        inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")}
-        self.check_grad(op, inputs, set("X"), "Y")
+class TestSoftmaxGradOp(GradientChecker):
+    def setUp(self):
+        self.op = create_op("softmax")
+        self.inputs = {
+            "logits": np.random.uniform(0.1, 1, [10, 10]).astype("float32")
+        }
+
+    def test_softmax_grad(self):
+        self.check_grad(self.op, self.inputs, ["logits"], "softmax")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
-- 
GitLab
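
For reviewers, here is a rough NumPy sketch of what the renamed operator computes: the forward pass is the row-wise, numerically stable softmax that the test's stable_softmax reference also uses, and the backward helper follows the standard softmax gradient formula (the hunks above only rename the kernel inputs/outputs and do not show the full kernel bodies, so the exact C++ implementation is not reproduced here). The helper names below are illustrative only and are not part of the patch.

import numpy as np


def stable_softmax_ref(logits):
    """Row-wise, numerically stable softmax for a [batch_size, num_classes] array."""
    # Shift by the row maximum so exp() does not overflow.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)


def softmax_grad_ref(softmax, grad_softmax):
    """Standard softmax backward: dX = (dY - sum(dY * Y, axis=1)) * Y."""
    dot = np.sum(grad_softmax * softmax, axis=1, keepdims=True)
    return (grad_softmax - dot) * softmax


if __name__ == "__main__":
    x = np.random.random((10, 10)).astype("float32")
    y = stable_softmax_ref(x)
    # Each row of the output is a probability distribution.
    assert np.allclose(y.sum(axis=1), 1.0, atol=1e-5)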