Polish Operator Doc (m) (#5375)

* fix m_ops * fix activation op

Polish Operator Doc (m) (#5375)
* fix m_ops * fix activation op
cb0118f3 · kexinzhao · Yi Wang · 5d8cdf20 · cb0118f3 · cb0118f3
9 changed file
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Sigmoid operator");
    AddOutput("Y", "Output of Sigmoid operator");
    AddComment(R"DOC(
-Sigmoid activation operator.
+Sigmoid Activation Operator.

 $y = 1 / (1 + e^{-x})$

@@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of LogSigmoid operator");
    AddOutput("Y", "Output of LogSigmoid operator");
    AddComment(R"DOC(
-Logsigmoid activation operator.
+Logsigmoid Activation Operator.

 $y = \log(1 / (1 + e^{-x}))$

@@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Exp operator");
    AddOutput("Y", "Output of Exp operator");
    AddComment(R"DOC(
-Exp activation operator.
+Exp Activation Operator.

 $y = e^x$

@@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Relu operator");
    AddOutput("Y", "Output of Relu operator");
    AddComment(R"DOC(
-Relu activation operator.
+Relu Activation Operator.

 $y = \max(x, 0)$

@@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("alpha", "The small negative slope")
        .SetDefault(static_cast<AttrType>(0.02f));
    AddComment(R"DOC(
-LeakyRelu activation operator.
+LeakyRelu Activation Operator.

 $y = \max(x, \alpha * x)$

@@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("lambda", "non-negative offset")
        .SetDefault(static_cast<AttrType>(0.5f));
    AddComment(R"DOC(
-Softshrink activation operator.
+Softshrink Activation Operator.

 $$
 y = \begin{cases} 
@@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Tanh operator");
    AddOutput("Y", "Output of Tanh operator");
    AddComment(R"DOC(
-Tanh activation operator.
+Tanh Activation Operator.

 $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

@@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of TanhShrink operator");
    AddOutput("Y", "Output of TanhShrink operator");
    AddComment(R"DOC(
-TanhShrink activation operator.
+TanhShrink Activation Operator.

 $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

@@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
        .SetDefault(static_cast<AttrType>(0.5));
    AddComment(R"DOC(
-HardShrink activation operator.
+HardShrink Activation Operator.

 $$
 y = \begin{cases} 
@@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Sqrt operator");
    AddOutput("Y", "Output of Sqrt operator");
    AddComment(R"DOC(
-Sqrt activation operator.
+Sqrt Activation Operator.

 $y = \sqrt{x}$

@@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Abs operator");
    AddOutput("Y", "Output of Abs operator");
    AddComment(R"DOC(
-Abs activation operator.
+Abs Activation Operator.

 $y = |x|$

@@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Reciprocal operator");
    AddOutput("Y", "Output of Reciprocal operator");
    AddComment(R"DOC(
-Reciprocal activation operator.
+Reciprocal Activation Operator.

 $$y = \frac{1}{x}$$

@@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Log operator");
    AddOutput("Y", "Output of Log operator");
    AddComment(R"DOC(
-Log activation operator.
+Log Activation Operator.

 $y = \ln(x)$

@@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Square operator");
    AddOutput("Y", "Output of Square operator");
    AddComment(R"DOC(
-Square activation operator.
+Square Activation Operator.

 $y = x^2$

@@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Softplus operator");
    AddOutput("Y", "Output of Softplus operator");
    AddComment(R"DOC(
-Softplus activation operator.
+Softplus Activation Operator.

 $y = \ln(1 + e^{x})$

@@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "Input of Softsign operator");
    AddOutput("Y", "Output of Softsign operator");
    AddComment(R"DOC(
-Softsign activation operator.
+Softsign Activation Operator.

 $$y = \frac{x}{1 + |x|}$$

@@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
        .SetDefault(static_cast<AttrType>(24));
    AddComment(R"DOC(
-BRelu activation operator.
+BRelu Activation Operator.

 $y = \max(\min(x, t_{min}), t_{max})$

@@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
        .SetDefault(static_cast<AttrType>(40));
    AddComment(R"DOC(
-SoftRelu activation operator.
+SoftRelu Activation Operator.

 $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$

@@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("alpha", "The alpha value of ELU")
        .SetDefault(static_cast<AttrType>(1.0f));
    AddComment(R"DOC(
-ELU activation operator.
+ELU Activation Operator.

 Applies the following element-wise computation on the input according to
 https://arxiv.org/abs/1511.07289.
@@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
        .SetDefault(static_cast<AttrType>(6));
    AddComment(R"DOC(
-Relu6 activation operator.
+Relu6 Activation Operator.

 $y = \min(\max(0, x), 6)$

@@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("factor", "The exponential factor of Pow")
        .SetDefault(static_cast<AttrType>(1));
    AddComment(R"DOC(
-Pow activation operator.
+Pow Activation Operator.

 $y = x^{factor}$

@@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
        .SetDefault(static_cast<AttrType>(1.7159));
    AddComment(R"DOC(
-STanh activation operator.
+STanh Activation Operator.

 $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

@@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("threshold", "The threshold location of activation")
        .SetDefault(static_cast<AttrType>(1.0));
    AddComment(R"DOC(
-ThresholdedRelu activation operator.
+ThresholdedRelu Activation Operator.

 $$
 y = \begin{cases} 
@@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
        .SetDefault(static_cast<AttrType>(0.5));
    AddComment(R"DOC(
-HardSigmoid activation operator.
+HardSigmoid Activation Operator.

 Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), 
 which is much faster than sigmoid.

--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
             "(2-D tensor with shape [batch_size x 1]) "
             "The label indicating X1 ranked higher than X2 or not, "
             "can only be +1 or -1.");
-    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
-        .SetDefault(static_cast<T>(0));
    AddOutput("Activated",
              "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
              "to indicate whether each element of Output(Out) is activated.")
@@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out",
              "(2-D tensor with shape [batch_size x 1]) "
              "The output loss of MarginRankLoss operator.");
+    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
+        .SetDefault(static_cast<T>(0));
    AddComment(R"DOC(
+MarginRankLoss Operator.

-MarginRankLoss operator measures the loss given a pair of training sample
+This operator measures the loss given a pair of training sample
 {`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` 
-indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss 
-turns out
+indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss 
+is calculated as:

-loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin).
+$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$

-The attribute `margin` involved here helps make the predictions more robust.
+The attribute `margin` here helps make the predictions more robust.
 Denote the item ranked higher as the positive sample, otherwise the negative 
 sample. If the score of the two samples satisfies 

-positive sample - negative sample < margin,
+$positive sample - negative sample < margin$

-the pair of samples will contribute to the final loss, which will backpropogate 
-and train the ranking model to enlarge the difference of the two score.
+the pair of samples will contribute to the final loss, which will backpropagate 
+and train the ranking model to enlarge the difference between the two scores.

 For batch input with size `batch_size`, `X1`, `X2` and `Label`
 all have the same shape [batch_size x 1].

--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -144,7 +144,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
        )DOC")
        .SetDefault(false);
    AddComment(R"DOC(
-The MatMul operator is used to perform (batched) matrix multiplication
+MatMul Operator.
+
+
+This operator is used to perform (batched) matrix multiplication
 over the last two dimensions of the input tensors `X` and `Y`.

 If a transpose flag is specified, the last two dimensions of the
@@ -166,7 +169,8 @@ The differences are:
 - We add `transpose_X` and `transpose_Y` flags.

 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
  }
 };

--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -36,7 +36,11 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op");
-    AddComment(R"DOC( Mean Operator
+    AddComment(R"DOC(
+Mean Operator.
+
+Out is a scalar which is the mean of all elements in X. 
+
 )DOC");
  }
 };

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -52,14 +52,16 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Y", "The right tensor of minus operator.");
    AddOutput("Out", "The output tensor of minus operator.");

-    AddComment(R"DOC(Minus Operator
+    AddComment(R"DOC(
+Minus Operator.

 Equation:

-    Out = X - Y
+    $Out = X - Y$

 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
  }
 };

--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -43,27 +43,35 @@ class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
                           framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
-             "The input tensor of modified huber loss op."
+             "The input tensor of modified huber loss op. "
             "X is 2-D tensor with shape [batch_size, 1].");
    AddInput("Y",
-             "The target labels of modified huber loss op."
-             "The shape of Y is same as X. Values of Y must be 0 or 1.");
+             "The target labels of modified huber loss op. "
+             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
    AddOutput("IntermediateVal",
              "Variable to save intermediate result which will be reused in "
              "backward processing.")
        .AsIntermediate();
    AddOutput("Out", "Classification loss for X.");
    AddComment(R"DOC(
-Modified huber loss is used in binary classification problem. The shape of
-input X and target Y are both [N, 1] and so is the shape of output loss.
-Since target Y is not differentiable, cacluating gradient for Y is illegal.
-The formulation of modified huber loss is:
-
-L(y, f(x)) = max(0, 1 - yf(x))^2  for yf(x) >= -1,
-             -4yf(x)              otherwise.
-
-Make sure the values of target label Y are in {0, 1} here. The operator will
+Modified Huber Loss Operator.
+
+This operator is used in binary classification problem. The shape of
+input X and target Y are both [N, 1] and so is the shape of the output loss.
+Since target Y is not differentiable, calculating gradient for Y is illegal.
+The formula of modified huber loss is:
+
+$$
+L(y, f(x)) = 
+\begin{cases}
+(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) >= -1    \\
+             -4yf(x),    \quad \text{otherwise}
+\end{cases}
+$$
+
+Make sure the values of target label Y are in {0, 1} here. This operator will
 scale values of Y to {-1, +1} when computing losses and gradients.
+
 )DOC");
  }
 };

--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -75,17 +75,23 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("VelocityOut", "(Tensor) Output updated velocity");

    AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<bool>("useNesterov", "(bool) Use Nesterov Momentum")
+    AddAttr<bool>("useNesterov",
+                  "(bool, default false) "
+                  "Use Nesterov Momentum")
        .SetDefault(false);
    AddComment(R"DOC(
-
-Momentum Algorithm with a flag for Nestrov Moemntum (momentum).
-
-velocity = mu * velocity + gradient
-if (use_nesterov):
-  param = param - gradient * learning_rate + mu * velocity * learning_rate
-else:
-  param = param - learning_rate * velocity
+Momentum Optimizer.
+
+This optimizer has a flag for Nestrov Momentum.
+The update equations are as follows:
+
+$$
+velocity = mu * velocity + gradient \\
+if (use\_nesterov):   \\
+  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+else:   \\
+  param = param - learning\_rate * velocity. \\
+$$

 )DOC");
  }

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -78,6 +78,7 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "The output of mul op");
    AddAttr<int>(
        "x_num_col_dims",
+        "(int, default 1) "
        R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
            in that case, tensors will be reshaped to a matrix. The matrix's first
            dimension(column length) will be the product of tensor's last
@@ -88,20 +89,24 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
        .EqualGreaterThan(1);
    AddAttr<int>(
        "y_num_col_dims",
+        "(int, default 1) "
        R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
             in that case, tensors will be reshaped to a matrix. Just like input `X`.
        )DOC")
        .SetDefault(1)
        .EqualGreaterThan(1);
    AddComment(R"DOC(
-Mul operator is used to perform matrix multiplication for input X and Y.
+Mul Operator. 
+
+This operator is used to perform matrix multiplication for input X and Y.

 The equation is:

-    Out = X * Y
+    $$Out = X * Y$$

 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
  }
 };

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -66,7 +66,8 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "The candidate tensors of multiplex operator.")
        .AsDuplicable();
    AddOutput("Out", "The output tensor of multiplex operator.");
-    AddComment(R"DOC(Multiplex operator
+    AddComment(R"DOC(
+Multiplex Operator.

 Multiplex multiple tensors according to the index provided by the index tensor.

@@ -77,10 +78,11 @@ the (Ids[i])-th tensor.

 For i-th row of the output tensor:

-y[i] = x_{k}[i]
+$$y[i] = x_{k}[i]$$

-where y is the output tensor. `x_{k}` is the k-th input tensor
+where `y` is the output tensor, `x_{k}` is the k-th input tensor,
 and `k = Ids[i]`.
+
 )DOC");
  }
 };