From 30a85204b46141dfb313bed2f0166e95c2ffb348 Mon Sep 17 00:00:00 2001
From: kavyasrinet
Date: Sat, 4 Nov 2017 19:27:11 -0700
Subject: [PATCH] Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad,
 BatchNorm, Clip, Cast and AUC (#5317)

* Adding the doc format for AdaDelta

* Updating the documentation for Adagrad, Adam and Adamax

* Updating the auc op

* Fix review comments

* Updating doc for Batch Norm

* Updating the cast op

* Updating the clip op

* Fixing review comment

* Fixing review comment:

* Small change to restart PR_CI
---
 paddle/operators/adadelta_op.cc     | 34 ++++++++++++++---------------
 paddle/operators/adagrad_op.cc      | 12 ++++++----
 paddle/operators/adam_op.cc         | 29 +++++++++++-------------
 paddle/operators/adamax_op.cc       | 22 ++++++++-----------
 paddle/operators/auc_op.cc          | 31 +++++++++++++-------------
 paddle/operators/batch_norm_op.cc   | 20 ++++++++++-------
 paddle/operators/cast_op.cc         | 14 +++++++-----
 paddle/operators/clip_op.cc         |  5 ++++-
 paddle/operators/name_convention.md | 12 +++++-----
 9 files changed, 92 insertions(+), 87 deletions(-)

diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index 24e419b53..b717e1647 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("AvgSquaredGrad",
-             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
     AddInput("AvgSquaredUpdate",
-             "(Tensor) Input expectation of squared parameter updates");
+             "(Tensor) Input average of squared parameter updates");
 
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("AvgSquaredGradOut",
-              "(Tensor) Output expectation of squared gradient");
+              "(Tensor) Output average of squared gradient");
     AddOutput("AvgSquaredUpdateOut",
-              "(Tensor) Output expectation of squared parameter updates");
+              "(Tensor) Output average of squared parameter updates");
 
     AddAttr<float>("rho",
                    "(float, default 0.95) Exponential decay rate "
@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                    "numerical stability")
         .SetDefault(1.0e-6f);
     AddComment(R"DOC(
-Adadelta Updates Operator.
+Adadelta Optimizer.
 
-This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
-adaptive learning rate method for gradient descent.
+Adadelta optimizer is implemented as explained in:
+https://arxiv.org/abs/1212.5701
+Adadelta is a per-dimension adaptive learning rate method used
+for gradient descent.
 
-Adadelta updates:
+Adadelta updates are as follows:
 
-avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
-param_update =  - sqrt((avg_squared_update + epsilon) /
-                       (avg_squared_grad_out + epsilon)) * grad
-avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
-param_out = param + param_update
-
-References:
-  [1] ADADELTA: An Adaptive Learning Rate Method
-      https://arxiv.org/abs/1212.5701
+$$
+avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \\
+paramUpdate = -\sqrt{\frac{avgSquaredUpdate + \epsilon}
+                          {avgSquaredGradOut + \epsilon}} * grad \\
+avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * (paramUpdate)^2 \\
+paramOut = param + paramUpdate
+$$
 
 )DOC");
   }
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index bc081f87d..8d1a2b793 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 
 Adaptive Gradient Algorithm (Adagrad).
 
-moment_out = moment + grad * grad
-param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+The update is done as follows:
+
+$$
+momentOut = moment + grad * grad \\
+paramOut = param - learningRate * grad / (\sqrt{momentOut} + \epsilon)
+$$
 
 The original paper (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have the epsilon attribute. It is added here for numerical stability
-by avoiding division by zero.
+does not have the epsilon attribute. It is added here in our implementation,
+as also proposed at http://cs231n.github.io/neural-networks-3/#ada,
+for numerical stability to avoid division by zero.
 
 )DOC");
   }
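For readers who want to sanity-check the two update rules above outside of Paddle, here is a minimal, framework-free C++ sketch of one Adadelta step and one Adagrad step over a flat parameter vector. The function and variable names are illustrative assumptions of this sketch, not Paddle's kernel code:

```cpp
// Illustrative sketches of the Adadelta and Adagrad update rules documented
// in the hunks above; hypothetical names, not Paddle's actual kernels.
#include <cmath>
#include <cstddef>
#include <vector>

// One Adadelta step; rho and epsilon mirror the "rho"/"epsilon" attributes.
void AdadeltaStep(std::vector<float> &param, const std::vector<float> &grad,
                  std::vector<float> &avg_squared_grad,
                  std::vector<float> &avg_squared_update,
                  float rho = 0.95f, float epsilon = 1.0e-6f) {
  for (std::size_t i = 0; i < param.size(); ++i) {
    // avgSquaredGradOut = rho * avgSquaredGrad + (1 - rho) * grad * grad
    avg_squared_grad[i] =
        rho * avg_squared_grad[i] + (1 - rho) * grad[i] * grad[i];
    // paramUpdate uses the previous avgSquaredUpdate and the fresh
    // avgSquaredGradOut, exactly as in the formula above.
    const float update = -std::sqrt((avg_squared_update[i] + epsilon) /
                                    (avg_squared_grad[i] + epsilon)) *
                         grad[i];
    avg_squared_update[i] =
        rho * avg_squared_update[i] + (1 - rho) * update * update;
    param[i] += update;  // paramOut = param + paramUpdate
  }
}

// One Adagrad step: accumulate squared gradients, then scale the update.
void AdagradStep(std::vector<float> &param, const std::vector<float> &grad,
                 std::vector<float> &moment, float learning_rate,
                 float epsilon = 1.0e-6f) {
  for (std::size_t i = 0; i < param.size(); ++i) {
    moment[i] += grad[i] * grad[i];  // momentOut = moment + grad * grad
    param[i] -= learning_rate * grad[i] / (std::sqrt(moment[i]) + epsilon);
  }
}
```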
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index 3572de06b..97a091ae7 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                       "Beta1 power accumulator should have 1 dimension");
     auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
         "Param and Grad input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment input of AdamOp should have same dimension");
+        "Param and Moment1 input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment2"),
-        "Param and InfNorm input of AdamOp should have same dimension");
+        "Param and Moment2 input of AdamOp should have same dimension");
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1.0e-8f);
     AddComment(R"DOC(
-Adam Updates Operator.
+Adam Optimizer.
 
-This implements the Adam optimizer from Section 2 of the Adam
-paper[1]. Adam is a first-order gradient-based optimization
-method based on adaptive estimates of lower-order moments.
+This implements the Adam optimizer from Section 2 of the Adam
+paper: https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.
 
 Adam updates:
 
-moment1_out = beta1 * moment1 + (1 − beta1) * grad
-moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
-learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
-param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+$$
+moment1Out = \beta_1 * moment1 + (1 - \beta_1) * grad \\
+moment2Out = \beta_2 * moment2 + (1 - \beta_2) * grad * grad \\
+learningRate = learningRate * \sqrt{1 - \beta_{2,pow}} / (1 - \beta_{1,pow}) \\
+paramOut = param - learningRate * moment1Out / (\sqrt{moment2Out} + \epsilon)
+$$
 
 )DOC");
   }
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index ff2565774..14cf3841b 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
              "Constant for numerical stability")
         .SetDefault(1.0e-8f);
     AddComment(R"DOC(
-Adamax Updates Operator.
+Adamax Optimizer.
 
-This implements the Adamax optimizer from Section 7 of the Adam
-paper[1]. Adamax is a variant of the
+We implement the Adamax optimizer from Section 7 of the Adam
+paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
 Adam algorithm based on the infinity norm.
 
 Adamax updates:
 
-moment_out = beta1 * moment + (1 - beta1) * grad
-inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-learning_rate_t = learning_rate/(1 - beta1_pow)
-param_out = param - learning_rate_t * moment_out/inf_norm_out
+$$
+momentOut = \beta_1 * moment + (1 - \beta_1) * grad \\
+infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \\
+learningRate = learningRate / (1 - \beta_{1,pow}) \\
+paramOut = param - learningRate * momentOut / infNormOut
+$$
 
 The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability
-by preventing divide by 0.
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+However, it is added here for numerical stability to prevent
+the division by zero error.
 
 )DOC");
   }
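In the same spirit, here is a hedged, self-contained C++ sketch of the Adam and Adamax rules quoted above. The `beta1_pow`/`beta2_pow` accumulators correspond to the `Beta1Pow`/`Beta2Pow` inputs of the ops; updating them inside the step function is a simplifying assumption of this sketch (the operators take them as inputs), and all names are hypothetical:

```cpp
// Illustrative C++ sketches of the Adam (Section 2) and Adamax (Section 7)
// update rules; not Paddle's kernels. Updating the beta-power accumulators
// in place here is a simplification for a self-contained example.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void AdamStep(std::vector<float> &param, const std::vector<float> &grad,
              std::vector<float> &moment1, std::vector<float> &moment2,
              float &beta1_pow, float &beta2_pow, float learning_rate,
              float beta1 = 0.9f, float beta2 = 0.999f,
              float epsilon = 1.0e-8f) {
  beta1_pow *= beta1;  // beta1^t
  beta2_pow *= beta2;  // beta2^t
  // Bias-corrected step size, as in the learningRate line above.
  const float lr_t =
      learning_rate * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
  for (std::size_t i = 0; i < param.size(); ++i) {
    moment1[i] = beta1 * moment1[i] + (1 - beta1) * grad[i];
    moment2[i] = beta2 * moment2[i] + (1 - beta2) * grad[i] * grad[i];
    param[i] -= lr_t * moment1[i] / (std::sqrt(moment2[i]) + epsilon);
  }
}

void AdamaxStep(std::vector<float> &param, const std::vector<float> &grad,
                std::vector<float> &moment, std::vector<float> &inf_norm,
                float &beta1_pow, float learning_rate,
                float beta1 = 0.9f, float beta2 = 0.999f,
                float epsilon = 1.0e-8f) {
  beta1_pow *= beta1;
  const float lr_t = learning_rate / (1 - beta1_pow);
  for (std::size_t i = 0; i < param.size(); ++i) {
    moment[i] = beta1 * moment[i] + (1 - beta1) * grad[i];
    // infNormOut = max(beta2 * infNorm + epsilon, |grad|)
    inf_norm[i] = std::max(beta2 * inf_norm[i] + epsilon, std::abs(grad[i]));
    param[i] -= lr_t * moment[i] / inf_norm[i];
  }
}
```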
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
index f5784922a..ccb969ab2 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices must be initialized.");
+                   "Input of Indices should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input of Label must be initialized.");
+                   "Input of Label should not be null.");
     auto inference_height = ctx->GetInputDim("Out")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
@@ -52,20 +52,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is descend sorted. This input should be the"
+             "Each row is sorted in descending order. This input should be the"
              "output of topk."
             "Typically, this tensor indicates the probability of each label");
     AddInput("Indices",
             "An int 2D tensor, indicating the indices of original"
-            "tensor before sort. Typically, this tensor indicates which label"
-            "the probability stands for.");
+            "tensor before sorting. Typically, this tensor indicates which "
+            "label the probability stands for.");
     AddInput("Label",
             "A 2D int tensor indicating the label of the training data."
             "The height is batch size and width is always 1.");
     // TODO(typhoonzero): support weight input
     AddOutput("AUC",
               "A scalar representing the "
-              "current area-under-curve.");
+              "current area-under-the-curve.");
 
     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
@@ -74,19 +74,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
                          " roc curve.")
         .SetDefault(200);
 
-    AddComment(
-        R"DOC(Computes the AUC according forward output and label.
-Best to use for binary classification evaluations.
+    AddComment(R"DOC(
+Area Under The Curve (AUC) Operator.
 
+This implementation computes the AUC according to the forward output and label.
+It is widely used in binary classification evaluation.
+
 As a note: If input label contains values other than 0 and 1, it will be cast
-to bool.
-
-You can find the definations here:
+to bool. You can find the relevant definitions here:
 https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
 
-Possible curves are:
-- ROC: Receiver operating characteristic
-- PR: Precision Recall
+There are two types of possible curves:
+1. ROC: Receiver operating characteristic
+2. PR: Precision Recall
 )DOC");
   }
 };
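To make the `num_thresholds` attribute concrete, the following is a rough, self-contained C++ sketch of a thresholded ROC-AUC computation in the spirit of the doc comment above. The uniform threshold grid, the `RocAuc` name, and the trapezoidal integration are assumptions of this sketch, not a description of the op's exact kernel:

```cpp
// Hypothetical sketch of a discretized ROC-AUC computation: sweep a grid of
// num_thresholds cutoffs, count TP/FP/TN/FN at each, integrate TPR over FPR.
#include <cstddef>
#include <vector>

double RocAuc(const std::vector<float> &prob,  // P(label == 1) per sample
              const std::vector<int> &label,   // ground truth per sample
              int num_thresholds = 200) {
  std::vector<double> tpr(num_thresholds), fpr(num_thresholds);
  for (int t = 0; t < num_thresholds; ++t) {
    const float thresh = static_cast<float>(t) / (num_thresholds - 1);
    long tp = 0, fp = 0, tn = 0, fn = 0;
    for (std::size_t i = 0; i < prob.size(); ++i) {
      const bool pred = prob[i] > thresh;
      const bool pos = label[i] != 0;  // non-{0, 1} labels are cast to bool
      if (pred && pos) ++tp;
      else if (pred && !pos) ++fp;
      else if (!pred && pos) ++fn;
      else ++tn;
    }
    tpr[t] = tp + fn > 0 ? static_cast<double>(tp) / (tp + fn) : 0.0;
    fpr[t] = fp + tn > 0 ? static_cast<double>(fp) / (fp + tn) : 0.0;
  }
  // Trapezoidal integration over the (FPR, TPR) curve. As the threshold
  // rises FPR falls, so fpr[t] - fpr[t + 1] is the non-negative step width.
  double auc = 0.0;
  for (int t = 0; t + 1 < num_thresholds; ++t) {
    auc += (fpr[t] - fpr[t + 1]) * (tpr[t] + tpr[t + 1]) / 2.0;
  }
  return auc;
}
```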
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 9c4bfd24c..7d73dfde7 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -70,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
                   : x_dims[x_dims.size() - 1]);
 
     PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input x must have 3 to 5 dimensions.");
+                   "Input X must have 3 to 5 dimensions.");
 
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
@@ -97,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input tensor");
     AddInput("Scale",
              "Scale is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Bias",
              "Bias is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Mean",
-             "The global mean (for training) or the "
+             "The global mean (for training) or "
              "estimated mean (for testing)");
     AddInput("Variance",
              "The global variance (for training) "
-             "or the estimated Variance (for testing)");
+             "or estimated Variance (for testing)");
     AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
@@ -123,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
                "will apply to output when training")
         .AsIntermediate();
     AddComment(R"DOC(
-https://arxiv.org/pdf/1502.03167.pdf
+Batch Normalization.
 
-NHWC `[batch, in_height, in_width, in_channels]`
-NCHW `[batch, in_channels, in_height, in_width]`
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+It can be used as a normalizer function for conv2d and fully_connected
+operations. The required data format for this layer is one of the following:
+1. NHWC `[batch, in_height, in_width, in_channels]`
+2. NCHW `[batch, in_channels, in_height, in_width]`
 
 )DOC");
   }
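As a companion to the data-format note above, here is a simplified C++ sketch of training-mode batch normalization for NCHW input, computing the per-channel statistics that the saved-mean/saved-variance style outputs refer to. It is an illustration under simplifying assumptions (no running-average update of `Mean`/`Variance`, double accumulation, hypothetical names), not the operator's implementation:

```cpp
// Simplified training-mode batch norm for NCHW data: per channel c,
// normalize over the N, H, W axes, then apply scale and bias.
#include <cmath>
#include <cstddef>
#include <vector>

// x and y are [N, C, H, W] flattened row-major; scale/bias have size C.
void BatchNormNchw(const std::vector<float> &x, std::vector<float> &y,
                   const std::vector<float> &scale,
                   const std::vector<float> &bias,
                   std::vector<float> &saved_mean,
                   std::vector<float> &saved_var,
                   int N, int C, int H, int W, float epsilon = 1.0e-5f) {
  const int hw = H * W;
  const long count = static_cast<long>(N) * hw;  // elements per channel
  for (int c = 0; c < C; ++c) {
    // Mean and (biased) variance of channel c across the mini-batch.
    double sum = 0.0, sq_sum = 0.0;
    for (int n = 0; n < N; ++n)
      for (int i = 0; i < hw; ++i) {
        const float v = x[(static_cast<std::size_t>(n) * C + c) * hw + i];
        sum += v;
        sq_sum += static_cast<double>(v) * v;
      }
    const float mean = static_cast<float>(sum / count);
    const float var = static_cast<float>(sq_sum / count - mean * mean);
    saved_mean[c] = mean;
    saved_var[c] = var;
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    for (int n = 0; n < N; ++n)
      for (int i = 0; i < hw; ++i) {
        const std::size_t idx = (static_cast<std::size_t>(n) * C + c) * hw + i;
        y[idx] = scale[c] * (x[idx] - mean) * inv_std + bias[c];
      }
  }
}
```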
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index 19187894c..70ee7861b 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   CastOpProtoMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensor of cast op");
-    AddOutput("Out", "the output tensor of cast op");
-    AddComment(R"DOC(Cast operator.
-cast the input tensor to other data type.
-)DOC");
+    AddInput("X", "The input tensor of cast op");
+    AddOutput("Out", "The output tensor of cast op");
     AddAttr<int>("out_data_type", "output data type");
     AddAttr<int>("in_data_type", "input data type");
+    AddComment(R"DOC(
+Cast Operator.
+
+This operator casts the input tensor to another data type and
+returns the output tensor.
+
+)DOC");
   }
 };
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index f80204c68..3e9066ceb 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>(
         "max", "(float) Maximum value, above which element is replaced by max");
     AddComment(R"DOC(
-Clip operator limits the given input within an interval. The interval is
+Clip Operator.
+
+The clip operator limits the value of the given input within an interval. The interval is
 specified with arguments 'min' and 'max'.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/name_convention.md b/paddle/operators/name_convention.md
index 5a2169079..62e7a6c84 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -4,10 +4,10 @@ To make the operator document itself more clear, we recommend operator names obe
 
 ### OpProtoMaker names
 
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61), and will be used in the client language to create the operator.
 
 - Input/Output.
-  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.
+  - Input/Output names follow **CamelCase**, e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Inputs/Outputs are much like Variables, so we prefer meaningful English words.
   - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified.
 
 - Attribute.
@@ -15,7 +15,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 - Comments.
   - Input/Output/Attr comments follow the format of **(type, default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g. Attribute in Accumulator: `"gamma"`, `(float, default 1.0) Accumulation multiplier`.
-  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`.
+  - Operator comments follow the format of `R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment, e.g. `Out = X + Y`.
 
 - Order.
   - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
@@ -24,7 +24,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 
 Here we give some examples to show how these rules will be used.
 
-- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`.
+- The operator has one input and one output, e.g. `relu`: inputs: `X`, outputs: `Out`.
 
 - The operator has two inputs, one output. e.g. `rowwise_add`, inputs: `X`, `Y`, outputs: `Out`.
 
@@ -38,8 +38,8 @@ public:
   AccumulateOpMaker(framework::OpProto *proto,
                     framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
-                   If the output size is not the same as input size,
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+                   If the output size is not the same as input size,
                    the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
     AddOutput("Out", "(Tensor) Accumulated output tensor");
     AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
-- 
GitLab