polish_g_to_l (#5367)

c0d2ca54 · kexinzhao · Yi Wang · af760eac · c0d2ca54 · c0d2ca54
11 changed files
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -67,11 +67,28 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The source input of gather op");
    AddInput("Index", "The index input of gather op");
-    AddOutput("Out", "The output of add op");
+    AddOutput("Out", "The output of gather op");
    AddComment(R"DOC(
-Gather Operator by selecting from the first axis,
+Gather Operator.
+
+$Out = X[Index]$
+
+Out is obtained by gathering entries of the outer-most dimension
+of X indexed by Index and concatenating them together.
+
+Example:
+
+X = [[1, 2],
+     [3, 4],
+     [5, 6]]
+
+Index = [[1, 2]]
+
+Then:
+
+Out = [[3, 4],
+       [5, 6]]

-Out = X[Index]
 )DOC");
  }
 };

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -68,21 +68,35 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  GaussianRandomOpMaker(framework::OpProto* proto,
                        framework::OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "output matrix of random op");
-    AddComment(R"DOC(
-GaussianRandom operator.
-Use to initialize tensor with gaussian random generator.
-)DOC");
+    AddOutput("Out", "Output matrix of gaussian random op");

-    AddAttr<std::vector<int>>("shape", "The dimension of random tensor.");
-    AddAttr<float>("mean", "mean of random tensor.").SetDefault(.0f);
-    AddAttr<float>("std", "std of random tensor.").SetDefault(1.0f);
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "The dimension of random tensor.");
+    AddAttr<float>("mean",
+                   "(float, default 0.0) "
+                   "mean of random tensor.")
+        .SetDefault(.0f);
+    AddAttr<float>("std",
+                   "(float, default 1.0) "
+                   "std of random tensor.")
+        .SetDefault(1.0f);
    AddAttr<int>("seed",
+                 "(int, default 0) "
                 "Random seed of generator."
-                 "0 means use system wide seed")
+                 " 0 means use system wide seed.")
        .SetDefault(0);
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<int>("data_type",
+                 "(int, default 5(FP32)) "
+                 "Output data type.")
        .SetDefault(framework::DataType::FP32);
+
+    AddComment(R"DOC(
+GaussianRandom Operator.
+
+Used to initialize tensors with gaussian random generator.
+
+)DOC");
  }
 };


--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -80,19 +80,21 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("HiddenPrev",
             "(Tensor) Matrix with shape [batch_size, frame_size] for the "
             "states of previous time step.");
-    AddInput("Weight",
-             "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
-             "The elements continuous in memory can be divided into two parts. "
-             "The first part are weights of the update gate and reset gate "
-             "with shape [frame_size, frame_size * 2], and the second part are "
-             "weights of output candidate with shape [frame_size, frame_size]");
-    AddInput("Bias",
-             "(Tensor) Bias vector with shape [1, frame_size * 3] concating "
-             "bias of the update gate, reset gate and output candidate.")
+    AddInput(
+        "Weight",
+        "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
+        "The elements continuous in memory can be divided into two parts. "
+        "The first part are weights of the update gate and reset gate "
+        "with shape [frame_size, frame_size * 2], and the second part are "
+        "weights of output candidate with shape [frame_size, frame_size].");
+    AddInput(
+        "Bias",
+        "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating "
+        "bias of the update gate, reset gate and output candidate.")
        .AsDispensable();
    AddOutput("Gate",
              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
-              "output of update gate, reset gate and output candidate")
+              "output of update gate, reset gate and output candidate.")
        .AsIntermediate();
    AddOutput("ResetHiddenPrev",
              "(Tensor) Matrix with shape [batch_size, frame_size] for the "
@@ -112,16 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(sigmoid)
        .InEnum({identity, sigmoid, tanh, relu});
    AddComment(R"DOC(
-GRUUnitOp implements part calculations of the GRU unit as following:
+GRUUnit Operator.

-\f[
-update \ gate: u_t = actGate(xu_t + W_u * hidden_prev + bias_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * hidden_prev + bias_r)  \\
-output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, hidden_prev) + bias_c) \\
-output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_prev)
-\f]
+This operator implements partial calculations of the GRU unit as follows:
+
+$$
+update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r)  \\
+output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\
+output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev})
+$$

 The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
+
 )DOC");
  }
 };

--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -59,10 +59,12 @@ class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
              "The shape is same as Input(X) and will be reused in backward.")
        .AsIntermediate();
    AddOutput("Out",
-              "The output tensor with shape [batch_size, 1] which represents "
-              "the huber loss.");
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the huber loss.");
    AddAttr<AttrType>("delta", "Hyper parameter in huber loss.");
    AddComment(R"DOC(
+HuberLoss Operator.
+
 Huber loss is a loss function used in robust regression. We define X as the
 input value and Y as the target value. Huber loss can evaluate the fitness of
 X to Y. Different from MSE loss, Huber loss is more robust for outliers. The

--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -39,14 +39,18 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input tensor of increment operator");
    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
-    AddComment(R"DOC(Increment operator
-
-The equation is: Out = X + step
-)DOC");
    AddAttr<AttrType>("step",
+                      "(float, default 1.0) "
                      "The step size by which the "
                      "input tensor will be incremented.")
        .SetDefault(1.0);
+    AddComment(R"DOC(
+Increment Operator.
+
+The equation is: 
+$$Out = X + step$$
+
+)DOC");
  }
 };


--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
@@ -57,7 +57,7 @@ L1 Norm Operator.

 Computes the L1 norm of a tensor.

-Out = sum (abs(X))
+$$Out = \sum{|X|}$$

 )DOC");
  }

--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -115,14 +115,18 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  LoadOpProtoMaker(framework::OpProto *proto,
                   framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "The tensor need to be loaded");
-    AddComment(R"DOC(Load Operator
-Load operator will load a tensor variable from disk file.
-)DOC");
+    AddOutput("Out", "(Tensor) The tensor needs to be loaded");
    AddAttr<std::string>("file_path",
+                         "(string) "
                         "Variable will be loaded from \"file_path\".")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+Load Operator.
+
+Load operator will load a tensor variable from disk file.
+
+)DOC");
  }
 };
 }  // namespace operators

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -53,21 +53,27 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                     framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("W",
-             "An input represents embedding tensors,"
-             " which is a learnable parameter.");
+             "An input represents embedding tensors, "
+             "which is a learnable parameter.");
    AddInput("Ids",
-             "An input with type int32 or int64"
-             "contains the ids to be looked up in W."
-             "Ids must be a column vector with rank = 2."
-             "The 2nd dimension size must be 1");
-    AddOutput("Out", "The lookup results, which have the same type with W.");
-    AddAttr<bool>("is_sparse", "Sparse update").SetDefault(false);
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "Ids must be a column vector with rank = 2. "
+             "The 2nd dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<bool>("is_sparse",
+                  "(boolean, default false) "
+                  "Sparse update")
+        .SetDefault(false);
    AddComment(R"DOC(
+Lookup Table Operator.
+
 This operator is used to perform lookups on the parameter W,
 then concatenated into a dense tensor.

-The input `Ids` can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD with input `Ids`.
+The input Ids can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD information with input Ids.
+
 )DOC");
  }
 };

--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -45,72 +45,70 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", R"DOC(
- (Tensor) The input of LRN operator. It must be a 4D tenor with NCHW format.
- )DOC");
-
+    AddInput("X",
+             "(Tensor) The input of LRN operator. "
+             "It must be a 4D tensor with NCHW format.");
    AddOutput("Out",
              "(Tensor) The output of LRN operator, which is also the 4D "
              "tensor with NCHW format.");
-    AddOutput("MidOut", R"Doc(
-(Tensor)Middle result of lrn op.It's computed in forward process 
-and also used in backward process.
-    )Doc");
-
-    AddAttr<int>("n", R"DOC(
-(int, default 5)n is “adjacent” kernel maps at the same spatial position.
-        )DOC")
+    AddOutput("MidOut",
+              "(Tensor) Middle result of LRN operator. It's computed in "
+              "forward process and also used in backward process.");
+
+    AddAttr<int>("n",
+                 "(int, default 5) "
+                 "n is the number of \"adjacent\" kernel maps "
+                 "at the same spatial position.")
        .SetDefault(5)
        .GreaterThan(0);

-    AddAttr<T>("k", R"DOC(
-(float, default 2.0)k is the bias.
-        )DOC")
+    AddAttr<T>("k",
+               "(float, default 2.0) "
+               "k is the bias.")
        .SetDefault(2.0)
        .GreaterThan(0.0);

-    AddAttr<T>("alpha", R"DOC(
-(float, default 0.0001)alpha is the scale number.
-        )DOC")
+    AddAttr<T>("alpha",
+               "(float, default 0.0001) "
+               "alpha is the scale number.")
        .SetDefault(0.0001)
        .GreaterThan(0.0);

-    AddAttr<T>("beta", R"DOC(
-(float, default 0.75)beta is the power number.
-        )DOC")
+    AddAttr<T>("beta",
+               "(float, default 0.75) "
+               "beta is the power number.")
        .SetDefault(0.75)
        .GreaterThan(0.0);

    AddComment(R"DOC(
- Local Response Normalization.
-
- This Function comes from the paper
- "ImageNet Classification with Deep Convolutional Neural Networks".
+Local Response Normalization Operator.

- The original formula is:
+This operator comes from the paper
+"ImageNet Classification with Deep Convolutional Neural Networks".

-                                Input(i, x, y)
- Output(i, x, y) = ----------------------------------------------
-                                 -- upper
-                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
-                                 -- j = lower
+The original formula is:

- upper is `min(C, c + n/2)`
- lower if `max(0, c - n/2)`
+$$
+Output(i, x, y) = Input(i, x, y) / \left(
+k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)}
+(Input(j, x, y))^2
+\right)^{\beta}
+$$

- Function implementation:
+Function implementation:

- inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
- And the meaning of each dimension(0-3) is respectively batch size,
- feature maps, rows and columns.
+Inputs and outputs are in NCHW format, while input.shape.ndims() equals 4.
+And dimensions 0 ~ 3 represent batch size, feature maps, rows,
+and columns, respectively.

- Input and Output in the above formula is for each map(i) of one image, and
- Input(i, x, y), Output(i, x, y) represents an element in an image.
+Input and Output in the formula above are for each map(i) of one image, and
+Input(i, x, y) and Output(i, x, y) each represent an element in an image.

- C is the number of feature maps of one image, and n is a hyper-parameters
- is configured when Function is initialized. The sum in the denominator
- is the sum of the same position in the neighboring maps.
-    )DOC");
+C is the number of feature maps of one image. n is a hyper-parameter
+configured when operator is initialized. The sum in the denominator
+is the sum of the same positions in the neighboring maps.
+    
+)DOC");
  }
 };


--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -103,7 +103,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("H0",
             "(Tensor, optional) the initial hidden state is an optional "
             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size, D is the hidden size.")
+             "batch size and D is the hidden size.")
        .AsDispensable();
    AddInput("C0",
             "(Tensor, optional) the initial cell state is an optional "
@@ -134,85 +134,82 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("BatchGate",
              "(LoDTensor) This LoDTensor contains input gate, forget gate "
              "and output gate after the nonlinear computation. This "
-              "LoDTensor has the same shape with the reorganized input, which "
+              "LoDTensor has the same shape as the reorganized input, which "
              "is also be called batch input. The LoD size is 2. The first "
              "LoD is the batch offsets and the second LoD contains the "
              "indexes, which denote the position of reorganized sequence "
              "in the raw input.")
        .AsIntermediate();
    AddOutput("BatchCellPreAct",
-              "(LoDTensor) This LoDTensor is got in the forward and used "
+              "(LoDTensor) This LoDTensor is obtained in the forward and used "
              "in the backward.")
        .AsIntermediate();
    AddAttr<bool>("usePeepholes",
-                  "(bool, defalut: True) "
+                  "(bool, default True) "
                  "whether to enable diagonal/peephole connections.")
        .SetDefault(true);
    AddAttr<bool>("isReverse",
-                  "(bool, defalut: False) "
+                  "(bool, default False) "
                  "whether to compute reversed LSTM.")
        .SetDefault(false);
    AddAttr<std::string>(
        "gateActivation",
-        "(string, default: sigmoid)"
+        "(string, default sigmoid) "
        "The activation for input gate, forget gate and output "
        "gate, `sigmoid` by default.")
        .SetDefault("sigmoid");
    AddAttr<std::string>("cellActivation",
-                         "(string, default: tanh)"
+                         "(string, default tanh) "
                         "The activation for cell output, `tanh` by defalut.")
        .SetDefault("tanh");
    AddAttr<std::string>("candidateActivation",
-                         "(string, default: tanh)"
+                         "(string, default tanh) "
                         "The activation for candidate hidden state, "
                         "`tanh` by default.")
        .SetDefault("tanh");
-    AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator
+    AddComment(R"DOC(
+Long-Short Term Memory (LSTM) Operator.

-The defalut implementation is diagonal/peephole connection [1], the formula is
-as follows
+The default implementation is diagonal/peephole connection
+(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:

-    i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\

-    f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
+f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\

-    \tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
+\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\

-    o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
+o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\

-    c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t}
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\

-    h_t = o_t ⊙ act_h(c_t)
+h_t = o_t \odot act_h(c_t)
+$$

 where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix
 of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
-are diagonal weight matrices for peephole connections. In our implenmention,
-We use vectors to reprenset these diagonal weight matrices. The b terms
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to represent these diagonal weight matrices. The b terms
 denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
-is the non-line actications, such as logistic sigmoid function, and
-\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate,
-output gate and cell activation vectors, all of which are the same size as
+is the non-linear activation, such as the logistic sigmoid function, and
+\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
 the cell output activation vector \f$h\f$.

-The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$
-are the cell input and cell output activation functions, `tanh` is usually
+The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$
+are the cell input and cell output activation functions and `tanh` is usually
 used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
 which is computed based on the current input and the previous hidden state.

-Set `usePeepholes` False to disable peephole connection [2]. The formula
+Set usePeepholes to False to disable peephole connection
+(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula
 is omitted here.

-@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
-operations on the input x_{t} were NOT included in this operator.
+Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+operations on the input \f$x_{t}\f$ are NOT included in this operator.
 Users can choose to use fully-connect operator before LSTM operator.

-[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory
-recurrent neural network architectures for large scale acoustic modeling.
-INTERSPEECH, 2014.
-
-[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory.
-Neural Computation, 9(8):1735-1780, 1997.
-
 )DOC");
  }
 };

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -57,17 +57,22 @@ class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
        "The cell state tensor of last time-step in the Lstm Unit operator.");
    AddOutput("C", "The cell tensor of Lstm Unit operator.");
    AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
-
-    AddComment(R"DOC(Lstm-Unit Operator
+    AddAttr<float>("forget_bias",
+                   "(float, default 0.0) "
+                   "The forget bias of Lstm Unit.")
+        .SetDefault(0.0);
+    AddComment(R"DOC(
+Lstm Unit Operator.

 Equation:
-  i, f, o, j = split(X)
-  C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j)
-  H = C * sigm(o)
+
+$$
+i, f, o, j = split(X) \\
+C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\
+H = C * sigm(o)
+$$

 )DOC");
-    AddAttr<float>("forget_bias", "The forget bias of Lstm Unit.")
-        .SetDefault(0.0);
  }
 };