Merge pull request #5354 from kexinzhao/cos_sim_to_dynamic_recur_op

polish operator doc

Merge pull request #5354 from kexinzhao/cos_sim_to_dynamic_recur_op
polish operator doc
7408a4c4 · kexinzhao · GitHub · 906e2565 · 6a07af06 · 7408a4c4
8 changed files
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -33,7 +33,7 @@ class AccuracyOp : public framework::OperatorWithKernel {

    auto inference_dim = ctx->GetInputDim("Out");
    auto label_dim = ctx->GetInputDim("Label");
-    // Assume indices has same shape with infernece, because
+    // Assume indices has same shape as inference, because
    // it's the output of topk.

    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
@@ -60,20 +60,24 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
                  framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // TODO(typhoonzero): support both inference value and indices.
-    AddInput("Out", "topk (inferences) the network output");
-    AddInput("Indices", "topk (indices) the network output");
+    AddInput("Out", "The network output of topk (inferences)");
+    AddInput("Indices", "The network output of topk (indices)");
    AddInput("Label", "Label of the training data");
    // TODO(typhoonzero): AddInput("Weight", ...
    AddOutput("Accuracy", "The accuracy of current batch");

    AddComment(R"DOC(
-Accuracy. It will print accuracy rate for classification.
-The accuracy is:
-..  math::
-accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})
+Accuracy Operator. 
+
+It will print accuracy rate for classification.
+The accuracy is calculated as follows:
+
+$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$
+
+Both the input Out and Label can carry the LoD (Level of Details)
+information, or not. But the output only shares the LoD information 
+with the input Out(Inference).

-Both the input `Out` and `Label` can carry the LoD (Level of Details)
-information, or not. But the output only shares the LoD with input `Inference`.
 )DOC");
  }
 };

--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -29,7 +29,7 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
                 "workspace is a section of GPU memory which will be "
                 "allocated/freed each time the operator runs, larger "
                 "workspace size can increase performance but also requires "
-                 "better hardward. This size should be carefully setted.")
+                 "better hardware. This size should be chosen carefully.")
        .SetDefault(4096);
  }
 };

--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -79,15 +79,16 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Cosine Similarity Operator.

-The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)).
+$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$

-The input `X` and `Y` must have the same shape, except that the 1st dimension
-of input `Y` could be just 1 (different from input `X`), which will be
-broadcasted to match the shape of input `X` before computing their cosine
+The input X and Y must have the same shape, except that the 1st dimension
+of input Y could be just 1 (different from input X), which will be
+broadcasted to match the shape of input X before computing their cosine
 similarity.

-Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+Both the input X and Y can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input X.
+
 )DOC");
  }
 };

--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -56,34 +56,35 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input of crop op. "
-             "The input should be a k-D tensor(k > 0 and k < 7)");
+             "The input should be a k-D tensor(k > 0 and k < 7).");
    AddInput("Y",
-             "The input used as reference for cropping"
-             " with the same dimension as X. ")
+             "The input used as reference for cropping, "
+             "which is of the same dimensions as X.")
        .AsDispensable();
    AddOutput("Out",
-              "The output of crop op "
-              "with the same dimension as X.");
+              "The output of crop op, "
+              "which is of the same dimensions as X.");
    AddAttr<std::vector<int>>("offsets",
-                              "A list<int> describing offsets to be cropped."
-                              "The size of offsets list should be as same as "
-                              "dimension size of  input X.");
+                              "A list<int> describing offsets to be cropped. "
+                              "The size of offsets list should be the same as "
+                              "the dimension size of input X.");
    AddAttr<std::vector<int>>("shape",
-                              "A list<int> describing the shape of output."
-                              "The size of shape list should be as same as "
-                              "dimension size of  input X.")
+                              "A list<int> describing the shape of output. "
+                              "The size of shape list should be the same as "
+                              "the dimension size of input X.")
        .SetDefault(std::vector<int>());
    AddComment(R"DOC(
 Crop Operator.
+
 Crop input into output, as specified by offsets and shape.

 There are two ways to set shape:
-1. referenc input: crop input X as shape as reference input.
+1. reference input: crop input X into the same shape as reference input.
                    The dimension of reference input should
-                    be as same as input X.
-2. shape list: crop input X by shape described by a list<int>.
-               The size of shape list should be as same as
-               dimension size of  input X.
+                    be the same as the dimension of input X.
+2. shape list: crop input X into the shape described by a list<int>.
+               The size of shape list should be the same as
+               the dimension size of input X.

 The input should be a k-D tensor(k > 0 and k < 7). As an example:

@@ -91,20 +92,20 @@ Given:

    X = [[0, 1, 2, 0, 0]
         [0, 3, 4, 0, 0]
-         [0, 0, 0, 0, 0]]
+         [0, 0, 0, 0, 0]],

 and

-    offsets = [0, 1]
+    offsets = [0, 1],

 and

-    shape = [2, 2]
+    shape = [2, 2],

-then we get
+we get:

    Out = [[1, 2],
-           [3, 4]]
+           [3, 4]].

 )DOC");
  }

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -117,9 +117,9 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
        "Label",
        "(Tensor, default Tensor<int>), the ground truth which is "
        "a 2-D tensor. "
-        "When soft_label is set to false, `Label` is a Tensor<int> with shape "
+        "When soft_label is set to false, Label is a Tensor<int> with shape "
        "[N x 1]. "
-        "When soft_label is set to true, `Label` is a Tensor<float/double> "
+        "When soft_label is set to true, Label is a Tensor<float/double> "
        "with shape [N x K].");
    AddOutput("Y",
              "(Tensor, default Tensor<float>), a 2-D tensor "
@@ -137,13 +137,13 @@ computation.
 1) One-hot cross-entropy:
    soft_label = false, Label[i, 0] indicates the class index for sample i:

-                Y[i] = -log(X[i, Label[i]])
+                $Y[i] = -\log(X[i, Label[i]])$

 2) Soft-label cross-entropy:
    soft_label = true, Label[i, j] indicates the soft label of class j
    for sample i:

-                Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
+                $Y[i] = \sum_j{-Label[i, j] * \log(X[i, j])}$

   Please make sure that in this case the summation of each row of Label
   equals one.
@@ -153,8 +153,9 @@ computation.
     non-zero element (equals 1), soft-label cross-entropy degenerates to a
     one-hot cross-entropy with one-hot label representation.

-Both the input `X` and `Label` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+Both the input X and Label can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input X.
+
 )DOC");
  }
 };

--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -75,11 +75,18 @@ class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
                   "Constant for numerical stability")
        .SetDefault(1.0e-6f);
    AddComment(R"DOC(
+Decayed Adagrad Optimizer.

-Decayed Adagrad
+The update is done as follows:

-moment_out = decay * moment + (1 - decay) * grad * grad
-param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+$$
+moment\_out = decay * moment + (1 - decay) * grad * grad \\
+param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon}
+$$
+
+The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+does not have an epsilon attribute. It is added here for numerical
+stability to avoid the division by zero error.

 )DOC");
  }

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -43,22 +43,24 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  DropoutOpMaker(framework::OpProto* proto,
                 framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
-        .SetDefault(.5f);
-    AddAttr<bool>("is_training", "Whether in training phase.").SetDefault(true);
-    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
    AddInput("X", "The input of dropout op.");
    AddOutput("Out", "The output of dropout op.");
    AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();

+    AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
+        .SetDefault(.5f);
+    AddAttr<bool>("is_training", "True if in training phase.").SetDefault(true);
+    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+
    AddComment(R"DOC(
 Dropout Operator.

-'Dropout' refers to randomly dropping out units in a nerual network. It is a
+Dropout refers to randomly dropping out units in a neural network. It is a
 regularization technique for reducing overfitting by preventing neuron
 co-adaptation during training. The dropout operator randomly sets (according to
 the given dropout probability) the outputs of some units to zero, while others
-being set to their inputs.
+are set equal to their corresponding inputs.
+
 )DOC");
  }
 };

--- a/paddle/operators/dynamic_recurrent_op.cc
+++ b/paddle/operators/dynamic_recurrent_op.cc
@@ -386,12 +386,13 @@ class DynamicRecurrentOpProtoAndCheckerMaker
        RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward];
    // inputs and outputs stored in proto
    AddInput(name.inlinks,
-             "the inputs that need to be segmented for each step.")
+             "The inputs that need to be segmented for each step.")
        .AsDuplicable();
-    AddInput(name.initial_states, "variables to initialize states.")
+    AddInput(name.initial_states, "Variables to initialize the states.")
        .AsDuplicable();

-    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
+    AddOutput(name.outlinks,
+              "The outputs that need to be concatenated for all steps.")
        .AsDuplicable();
    AddOutput(name.step_scopes, "step scopes");

@@ -399,7 +400,12 @@ class DynamicRecurrentOpProtoAndCheckerMaker
    AddAttr<std::vector<std::string>>(name.ex_states, "names of ex_states");
    AddAttr<std::vector<std::string>>(name.states, "names of states");

-    AddComment("This is a RNN operator for varience-length sequences.");
+    AddComment(R"DOC(
+Dynamic Recurrent Operator.
+
+This is an RNN operator for variable-length sequences.
+
+)DOC");
  }
 };