diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 483f9888973edc9db6317723c136778d40cc7878..83d35a450d0e8ebf5311cdfd948b066642ccec8c 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sigmoid operator");
     AddOutput("Y", "Output of Sigmoid operator");
     AddComment(R"DOC(
-Sigmoid activation operator.
+Sigmoid Activation Operator.
 
 $y = 1 / (1 + e^{-x})$
 
@@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of LogSigmoid operator");
     AddOutput("Y", "Output of LogSigmoid operator");
     AddComment(R"DOC(
-Logsigmoid activation operator.
+Logsigmoid Activation Operator.
 
 $y = \log(1 / (1 + e^{-x}))$
 
@@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Exp operator");
     AddOutput("Y", "Output of Exp operator");
     AddComment(R"DOC(
-Exp activation operator.
+Exp Activation Operator.
 
 $y = e^x$
 
@@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Relu operator");
     AddOutput("Y", "Output of Relu operator");
     AddComment(R"DOC(
-Relu activation operator.
+Relu Activation Operator.
 
 $y = \max(x, 0)$
 
@@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("alpha", "The small negative slope")
         .SetDefault(static_cast<AttrType>(0.02f));
     AddComment(R"DOC(
-LeakyRelu activation operator.
+LeakyRelu Activation Operator.
 
 $y = \max(x, \alpha * x)$
 
@@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("lambda", "non-negative offset")
         .SetDefault(static_cast<AttrType>(0.5f));
     AddComment(R"DOC(
-Softshrink activation operator.
+Softshrink Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Tanh operator");
     AddOutput("Y", "Output of Tanh operator");
     AddComment(R"DOC(
-Tanh activation operator.
+Tanh Activation Operator.
 
 $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
@@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of TanhShrink operator");
     AddOutput("Y", "Output of TanhShrink operator");
     AddComment(R"DOC(
-TanhShrink activation operator.
+TanhShrink Activation Operator.
 
 $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
@@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
         .SetDefault(static_cast<AttrType>(0.5));
     AddComment(R"DOC(
-HardShrink activation operator.
+HardShrink Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sqrt operator");
     AddOutput("Y", "Output of Sqrt operator");
     AddComment(R"DOC(
-Sqrt activation operator.
+Sqrt Activation Operator.
 
 $y = \sqrt{x}$
 
@@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Abs operator");
     AddOutput("Y", "Output of Abs operator");
     AddComment(R"DOC(
-Abs activation operator.
+Abs Activation Operator.
 
 $y = |x|$
 
@@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Reciprocal operator");
     AddOutput("Y", "Output of Reciprocal operator");
     AddComment(R"DOC(
-Reciprocal activation operator.
+Reciprocal Activation Operator.
 
 $$y = \frac{1}{x}$$
 
@@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Log operator");
     AddOutput("Y", "Output of Log operator");
     AddComment(R"DOC(
-Log activation operator.
+Log Activation Operator.
 
 $y = \ln(x)$
 
@@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Square operator");
     AddOutput("Y", "Output of Square operator");
     AddComment(R"DOC(
-Square activation operator.
+Square Activation Operator.
 
 $y = x^2$
 
@@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softplus operator");
     AddOutput("Y", "Output of Softplus operator");
     AddComment(R"DOC(
-Softplus activation operator.
+Softplus Activation Operator.
 
 $y = \ln(1 + e^{x})$
 
@@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softsign operator");
     AddOutput("Y", "Output of Softsign operator");
     AddComment(R"DOC(
-Softsign activation operator.
+Softsign Activation Operator.
 
 $$y = \frac{x}{1 + |x|}$$
 
@@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
         .SetDefault(static_cast<AttrType>(24));
     AddComment(R"DOC(
-BRelu activation operator.
+BRelu Activation Operator.
 
 $y = \max(\min(x, t_{min}), t_{max})$
 
@@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
         .SetDefault(static_cast<AttrType>(40));
     AddComment(R"DOC(
-SoftRelu activation operator.
+SoftRelu Activation Operator.
 
 $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
@@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("alpha", "The alpha value of ELU")
         .SetDefault(static_cast<AttrType>(1.0f));
     AddComment(R"DOC(
-ELU activation operator.
+ELU Activation Operator.
 
 Applies the following element-wise computation on the input according to
 https://arxiv.org/abs/1511.07289.
@@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold value of Relu6")
         .SetDefault(static_cast<AttrType>(6));
     AddComment(R"DOC(
-Relu6 activation operator.
+Relu6 Activation Operator.
 
 $y = \min(\max(0, x), 6)$
 
@@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("factor", "The exponential factor of Pow")
         .SetDefault(static_cast<AttrType>(1));
     AddComment(R"DOC(
-Pow activation operator.
+Pow Activation Operator.
 
 $y = x^{factor}$
 
@@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
         .SetDefault(static_cast<AttrType>(1.7159));
     AddComment(R"DOC(
-STanh activation operator.
+STanh Activation Operator.
 
 $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
@@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold location of activation")
         .SetDefault(static_cast<AttrType>(1.0));
     AddComment(R"DOC(
-ThresholdedRelu activation operator.
+ThresholdedRelu Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
         .SetDefault(static_cast<AttrType>(0.5));
     AddComment(R"DOC(
-HardSigmoid activation operator.
+HardSigmoid Activation Operator.
 
 Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), 
 which is much faster than sigmoid.
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
index 638a99addc2119e8f44648cc54b97bd8a892d2bc..d7e8a0ea7632650203106b01531d724cf0b8e085 100644
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
              "(2-D tensor with shape [batch_size x 1]) "
              "The label indicating X1 ranked higher than X2 or not, "
              "can only be +1 or -1.");
-    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
-        .SetDefault(static_cast<T>(0));
     AddOutput("Activated",
               "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
               "to indicate whether each element of Output(Out) is activated.")
@@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "(2-D tensor with shape [batch_size x 1]) "
               "The output loss of MarginRankLoss operator.");
+    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
+        .SetDefault(static_cast<T>(0));
     AddComment(R"DOC(
+MarginRankLoss Operator.
 
-MarginRankLoss operator measures the loss given a pair of training sample
+This operator measures the loss given a pair of training samples
 {`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` 
-indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss 
-turns out
+indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss 
+is calculated as:
 
-loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin).
+$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
 
-The attribute `margin` involved here helps make the predictions more robust.
+The attribute `margin` here helps make the predictions more robust.
 Denote the item ranked higher as the positive sample, otherwise the negative 
 sample. If the score of the two samples satisfies 
 
-positive sample - negative sample < margin,
+$\text{positive sample} - \text{negative sample} < margin$
 
-the pair of samples will contribute to the final loss, which will backpropogate 
-and train the ranking model to enlarge the difference of the two score.
+the pair of samples will contribute to the final loss, which will backpropagate 
+and train the ranking model to enlarge the difference between the two scores.
 
 For batch input with size `batch_size`, `X1`, `X2` and `Label`
 all have the same shape [batch_size x 1].
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
index 5ecbee3b413617e3a5523d9a32e72bc08bd316c5..5a1a6154203d40186f1e41491194b19612931b1f 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -144,7 +144,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
         )DOC")
         .SetDefault(false);
     AddComment(R"DOC(
-The MatMul operator is used to perform (batched) matrix multiplication
+MatMul Operator.
+
+
+This operator is used to perform (batched) matrix multiplication
 over the last two dimensions of the input tensors `X` and `Y`.
 
 If a transpose flag is specified, the last two dimensions of the
@@ -166,7 +169,8 @@ The differences are:
 - We add `transpose_X` and `transpose_Y` flags.
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 7caa1c9d0cf4dba33a206c85bcbed1fb1cb4e010..78b4bbca84d4670aba73222f1d679604d7516b02 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -36,7 +36,11 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op");
-    AddComment(R"DOC( Mean Operator
+    AddComment(R"DOC(
+Mean Operator.
+
+Out is a scalar which is the mean of all elements in X. 
+
 )DOC");
   }
 };
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index f7943e99acc5975d077f2319b6f678cfc693c1f3..4684c20208501a3239fd57b35428946bb52af4a0 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -52,14 +52,16 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Y", "The right tensor of minus operator.");
     AddOutput("Out", "The output tensor of minus operator.");
 
-    AddComment(R"DOC(Minus Operator
+    AddComment(R"DOC(
+Minus Operator.
 
 Equation:
 
-    Out = X - Y
+    $Out = X - Y$
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 7b9e9528952d552a69ffe6a628672901c5c1a7fd..28528848af1f467bf38be53f9d05fee6ca3f93cc 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -43,27 +43,35 @@ class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
                            framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "The input tensor of modified huber loss op."
+             "The input tensor of modified huber loss op. "
              "X is 2-D tensor with shape [batch_size, 1].");
     AddInput("Y",
-             "The target labels of modified huber loss op."
-             "The shape of Y is same as X. Values of Y must be 0 or 1.");
+             "The target labels of modified huber loss op. "
+             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
     AddOutput("IntermediateVal",
               "Variable to save intermediate result which will be reused in "
               "backward processing.")
         .AsIntermediate();
     AddOutput("Out", "Classification loss for X.");
     AddComment(R"DOC(
-Modified huber loss is used in binary classification problem. The shape of
-input X and target Y are both [N, 1] and so is the shape of output loss.
-Since target Y is not differentiable, cacluating gradient for Y is illegal.
-The formulation of modified huber loss is:
-
-L(y, f(x)) = max(0, 1 - yf(x))^2  for yf(x) >= -1,
-             -4yf(x)              otherwise.
-
-Make sure the values of target label Y are in {0, 1} here. The operator will
+Modified Huber Loss Operator.
+
+This operator is used in binary classification problems. The shape of
+input X and target Y are both [N, 1] and so is the shape of the output loss.
+Since target Y is not differentiable, calculating gradient for Y is illegal.
+The formula of modified huber loss is:
+
+$$
+L(y, f(x)) = 
+\begin{cases}
+(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) \geq -1    \\
+             -4yf(x),    \quad \text{otherwise}
+\end{cases}
+$$
+
+Make sure the values of target label Y are in {0, 1} here. This operator will
 scale values of Y to {-1, +1} when computing losses and gradients.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
index 2d4d6f13720f0e6888edbddcb3243116506227ba..e8ce16f4cfcf83fd13e4d3a5318a4ae0c8c8449c 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -75,17 +75,23 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("VelocityOut", "(Tensor) Output updated velocity");
 
     AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<bool>("useNesterov", "(bool) Use Nesterov Momentum")
+    AddAttr<bool>("useNesterov",
+                  "(bool, default false) "
+                  "Use Nesterov Momentum")
         .SetDefault(false);
     AddComment(R"DOC(
-
-Momentum Algorithm with a flag for Nestrov Moemntum (momentum).
-
-velocity = mu * velocity + gradient
-if (use_nesterov):
-  param = param - gradient * learning_rate + mu * velocity * learning_rate
-else:
-  param = param - learning_rate * velocity
+Momentum Optimizer.
+
+This optimizer has a flag for Nesterov Momentum.
+The update equations are as follows:
+
+$$
+velocity = mu * velocity + gradient \\
+if (use\_nesterov):   \\
+  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+else:   \\
+  param = param - learning\_rate * velocity. \\
+$$
 
 )DOC");
   }
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 90acf034d905e6ab3ba7bf8c3d29e1ef1161ed0c..3c39ae10dc50084cff284c307167c33c9208a3ce 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -78,6 +78,7 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The output of mul op");
     AddAttr<int>(
         "x_num_col_dims",
+        "(int, default 1) "
         R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
             in that case, tensors will be reshaped to a matrix. The matrix's first
             dimension(column length) will be the product of tensor's last
@@ -88,20 +89,24 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
         .EqualGreaterThan(1);
     AddAttr<int>(
         "y_num_col_dims",
+        "(int, default 1) "
         R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
              in that case, tensors will be reshaped to a matrix. Just like input `X`.
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
     AddComment(R"DOC(
-Mul operator is used to perform matrix multiplication for input X and Y.
+Mul Operator. 
+
+This operator is used to perform matrix multiplication for input X and Y.
 
 The equation is:
 
-    Out = X * Y
+    $$Out = X * Y$$
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 4d86769026e4b3e3040bdcb3bc6dc2edea58b4b0..234fddcfd55ccc66f6378689dbc426499474b11f 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -66,7 +66,8 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The candidate tensors of multiplex operator.")
         .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
-    AddComment(R"DOC(Multiplex operator
+    AddComment(R"DOC(
+Multiplex Operator.
 
 Multiplex multiple tensors according to the index provided by the index tensor.
 
@@ -77,10 +78,11 @@ the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-y[i] = x_{k}[i]
+$$y[i] = x_{k}[i]$$
 
-where y is the output tensor. `x_{k}` is the k-th input tensor
+where `y` is the output tensor, `x_{k}` is the k-th input tensor,
 and `k = Ids[i]`.
+
 )DOC");
   }
 };