diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 6864e3b0b7ace69f7e4b5f7e129b476d28591a34..bcb48e13bd948b4e91ce8cbd7231a9619fac8d18 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -23,21 +23,21 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Emission",
-             "(LoDTensor, default: LoDTensor<float>). "
-             "A 2-D LoDTensor with shape [N x D] where N is the size of the "
+             "(LoDTensor, default LoDTensor<float>) "
+             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
              "mini-batch and D is the total tag number. The unscaled emission "
              "weight matrix for the linear chain CRF. ");
     AddInput("Transition",
-             "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+             "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
              "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor, default: LoDTensor<int>). A LoDTensor with shape "
+             "(LoDTensor, default LoDTensor<int>) A LoDTensor with shape "
              "[N x 1], where N is the total element number in a mini-batch. "
              "The ground truth.");
     AddOutput(
         "Alpha",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The forward vectors for the entire batch. Denote it as \f$\alpha\f$. "
         "\f$\alpha$\f is a memo table used to calculate the normalization "
         "factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized "
@@ -49,26 +49,28 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddOutput(
         "EmissionExps",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The exponentials of Input(Emission). This is an intermediate "
         "computational result in forward computation, and will be reused in "
         "backward computation.")
         .AsIntermediate();
     AddOutput(
         "TransitionExps",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
         "[(D + 2) x D]. The exponentials of Input(Transition). This is an "
         "intermediate computational result in forward computation, and "
         "will be reused in backward computation.")
         .AsIntermediate();
     AddOutput(
         "LogLikelihood",
-        "(Tensor, default: Tensor<float>). The logarithm of the conditional "
+        "(Tensor, default Tensor<float>) The logarithm of the conditional "
         "likelihood of each training sample in a mini-batch. This is a 2-D "
         "tensor with shape [S x 1], where S is the sequence number in a "
         "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
         "The output is no longer a LoDTensor.");
     AddComment(R"DOC(
+LinearChainCRF Operator.
+
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
 variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
@@ -82,29 +84,28 @@ and output must be linear sequences. Thus, the graph of such a CRF is a simple
 chain or a line, which results in the linear chain CRF.
 
 This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
-http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
+CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
 
 Equation:
-
-- Denote Input(Emission) to this operator as \f$x\f$ here.
-- The first D values of Input(Transition) to this operator are for starting
+1. Denote Input(Emission) to this operator as \f$x\f$ here.
+2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as \f$a\f$ here.
-- The next D values of Input(Transition) of this operator are for ending
+3. The next D values of Input(Transition) of this operator are for ending
 weights, denoted as \f$b\f$ here.
-- The remaning values of Input(Transition) are for transition weights,
+4. The remaning values of Input(Transition) are for transition weights,
 denoted as \f$w\f$ here.
-- Denote Input(Label) as \f$s\f$ here.
+5. Denote Input(Label) as \f$s\f$ here.
 
 The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as:
-\f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
+\f$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
                  + \sum_{l=1}^L x_{s_l}
                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
 where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
 all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight
 to the linear chain CRF.
 
-Finaly, the linear chain CRF operator outputs the logarithm of the conditional
+Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.
 
 NOTE:
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index d39cb2fcf9cc205edf86f8ab1d5e04b5672e00f6..66fcc09bc877867e66a37adc73230d8dabf4cbed 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -48,12 +48,17 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "gpu id lists");
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
+    AddAttr<int>("data_type",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
         .SetDefault(framework::DataType::FP32);
     AddComment(R"DOC(
-               create communicator.
-        )DOC");
+NCCLInit Operator.
+
+Create communicator.
+
+)DOC");
   }
 };
 
@@ -143,11 +148,15 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
     AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
                          "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
         .SetDefault("ncclSum");
     AddComment(R"DOC(
-            AllReduce the input tensors.
-        )DOC");
+NCCLAllReduce Operator.
+
+AllReduce the input tensors.
+
+)DOC");
   }
 };
 
@@ -161,14 +170,20 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
     AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
                          "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
         .SetDefault("ncclSum");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not "
-                 "set(platform::kInvalidGPUId). hashed by name.")
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
         .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
-            Reduce the tensors)DOC");
+NCCLReduce Operator.
+
+Reduce the tensors.
+
+)DOC");
   }
 };
 
@@ -182,12 +197,16 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not "
-                 "set(platform::kInvalidGPUId). hashed by name.")
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
         .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
-            Bcast the tensors.
-        )DOC");
+NCCLBcast Operator.
+
+Bcast the tensors.
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 73a0b8baff530840ddd0d4c65cd4c060ab18e401..adb75df6ef10c59fc6f3db4d36e1ffb1ae0b4b1e 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -54,41 +54,44 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7)");
     AddOutput("Out",
-              "The output of pad op."
+              "The output of pad op. "
               "A tensor with the same shape as X.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules for each dimension. "
+        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
+        "padding 0 row to top, 1 row to bottom, 2 columns to left "
+        "and 3 columns to right. Size of paddings should be equal to "
+        "2 * dimension size of the input tensor.");
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas.")
+        .SetDefault(0.0f);
     AddComment(R"DOC(
-Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:
+Pad Operator.
+
+Pad input into output, as specified by paddings and pad_value. 
+The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
 Given:
 
 X = [[1, 2],
-   [3, 4]]
-
-and
+     [3, 4]],
 
-paddings = [0, 1, 1, 2]
+paddings = [0, 1, 1, 2],
 
 and
 
-pad_value = 0
+pad_value = 0,
 
-then we get
+we have:
 
 Out = [[0, 1, 2, 0, 0]
        [0, 3, 4, 0, 0]
        [0, 0, 0, 0, 0]]
+
 )DOC");
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "A list<int> to describes padding rules for each dimension."
-        " For 2-D image tensor, paddings=[0, 1, 2, 3] means"
-        " padding 0 row to top, 1 row to bottom, 2 columns to left"
-        " and 3 columns to right.Size of paddings should be equal to"
-        " 2 * dimension size of input tensor.");
-    AddAttr<float>("pad_value",
-                   "(float) default to 0; "
-                   "The value to fill padded areas.")
-        .SetDefault(0.0f);
   }
 };
 
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index 4d75c11bc8130343e95f75e687529303179caa93..f58aab733866973f477ca79e5e53ba58adbf3dc7 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -73,125 +73,138 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
   AddInput(
       "X",
       "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of channels, H and W is the height and width of feature.");
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
   AddOutput("Out",
-            "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCHW."
-            "Where N is batch size, C is "
-            "the number of channels, H and W is the height and "
-            "width of feature.");
+            "(Tensor) The output tensor of pooling operator. "
+            "The format of output tensor is also NCHW, "
+            "where N is batch size, C is the number of channels, "
+            "H is the height of the feature, "
+            "and W is the width of the feature.");
 
   AddAttr<std::string>("poolingType",
                        "(string), pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
   AddAttr<std::vector<int>>("ksize",
-                            "(vector ), the pooling window size(height, width) "
-                            "of pooling operator."
+                            "(vector<int>) The pooling window "
+                            "size(height, width) of the pooling operator. "
                             "If globalPooling = true, ksize and paddings will "
                             "be ignored.");  // TODO(Chengduo): Add checker.
                                              // (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<bool>("globalPooling",
-                "(bool default: false), whether to use the global pooling."
+                "(bool, default false) Whether to use the global pooling. "
                 "If globalPooling = true, ksize and paddings will be ignored.")
       .SetDefault(false);
-  AddAttr<std::vector<int>>(
-      "strides",
-      "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int>, default {1, 1}), strides(height, "
+                            "width) of pooling operator.")
       .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector defalut:{0,0}), paddings(height, width) of pooling operator."
+      "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
+      "operator."
       "If globalPooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
 
   AddComment(R"DOC(
+Pool2d Operator.
+
 The pooling2d operation calculates the output based on
 the input, poolingType and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
-number of channels, H and W is the height and width of feature.
+Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, H_in, W_in)
+       X shape: $(N, C, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, H_out, W_out)
-  where
-       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       Out shape: $(N, C, H_{out}, W_{out})$
+  where 
+       $$ 
+       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       $$
+
 )DOC");
 }
 
 Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                              framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput(
-      "X",
-      "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCDHW. Where N is batch size, C is "
-      "the number of channels, D, H and W is the depth, height and width of "
-      "feature.");
+  AddInput("X",
+           "(Tensor) The input tensor of pooling operator. "
+           "The format of input tensor is NCDHW, where N is batch size, C is "
+           "the number of channels, and D, H and W is the depth, height and "
+           "width of "
+           "the feature, respectively.");
   AddOutput("Out",
             "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCDHW."
-            "Where N is batch size, C is "
-            "the number of channels, D, H and W is the depth, height and "
-            "width of feature.");
+            "The format of output tensor is also NCDHW, "
+            "where N is batch size, C is "
+            "the number of channels, and D, H and W is the depth, height and "
+            "width of the feature, respectively.");
 
   AddAttr<std::string>("poolingType",
-                       "(string), pooling type, can be \"max\" for max-pooling "
+                       "(string) Pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
-  AddAttr<std::vector<int>>("ksize",
-                            "(vector ), the pooling window size(depth, height, "
-                            "width) of pooling "
-                            "operator."
-                            "If globalPooling = true, ksize and paddings wille "
-                            "be ignored.");  // TODO(Chengduo): Add checker.
-                                             // (Currently,
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "(vector<int>) The pooling window size(depth, height, "
+      "width) of pooling operator. "
+      "If globalPooling = true, ksize and paddings will "
+      "be ignored.");  // TODO(Chengduo): Add checker.
+                       // (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<bool>("globalPooling",
-                "(bool default: false), whether to use the global pooling."
+                "(bool, default false) Whether to use the global pooling. "
                 "If globalPooling = true, ksize and paddings wille be ignored.")
       .SetDefault(false);
-  AddAttr<std::vector<int>>("strides",
-                            "(vector, default:{1,1,1}), strides(depth, height, "
-                            "width) of pooling operator.")
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector<int>, default {1,1,1}) Strides(depth, height, "
+      "width) of the pooling operator.")
       .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector defalut:{0,0,0}), paddings(depth, height, "
-      "width) of pooling operator."
-      "If globalPooling = true, ksize and paddings wille be ignored.")
+      "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
+      "width) of pooling operator. "
+      "If globalPooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
 
   AddComment(R"DOC(
+Pool3d Operator.
+
 The pooling3d operation calculates the output based on
-the input, poolingType and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCDHW format. Where N is batch
-size, C is the number of channels, D, H and W is the depth, height and
-width of feature. Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
-The input(X) size and output(Out) size may be different.
+the input, poolingType, ksize, strides, and paddings parameters.
+Input(X) and output(Out) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. Parameters(ksize, strides, paddings) 
+are three elements. These three elements represent depth, height and 
+width, respectively. The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, D_in, H_in, W_in)
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, D_out, H_out, W_out)
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
   where
-       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+       $$
+       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
+       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       $$
+
 )DOC");
 }
 }  // namespace operators
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index 95e896e7cc33b1aebe78d1af8746a25318048041..a31b3fcb7083ba5bfbcb6666c4641833ae1e69eb 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -89,64 +89,73 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "(Tensor), the input tensor of pooling operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of image.");
+        "(Tensor) The input tensor of pooling operator. "
+        "The format of input tensor is NCHW, where N is batch size, C is the "
+        "number of channels, H is the height of the image, "
+        "and W is the width of the image.");
     AddOutput("Out",
-              "(Tensor), the output tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is "
-              "the number of channels, H and W is the height and "
-              "width of image.");
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is "
+              "the number of channels, H is the height of the image "
+              "and W is the width of the image.");
     AddOutput("Mask",
-              "(Tensor), the Mask tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is the number of channels, H and W "
-              "is the height and width of image."
-              "The value in it is the index in current feature map");
+              "(Tensor) The Mask tensor of pooling operator."
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is the number of channels, "
+              "H is the height of the image, "
+              "and W is the width of the image. "
+              "It represents the index in the current feature map.");
 
     AddAttr<std::vector<int>>("ksize",
-                              "(vector ), the pooling window size(height, "
-                              "width) of pooling operator."
+                              "(vector<int>) The pooling window size(height, "
+                              "width) of pooling operator. "
                               "If globalPooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "globalPooling",
-        "(bool default: false), whether to use the global pooling."
+        "(bool, default false) Whether to use the global pooling. "
         "If globalPooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
-    AddAttr<std::vector<int>>(
-        "strides",
-        "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int>, default {1, 1}), strides(height, "
+                              "width) of pooling operator.")
         .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector defalut:{0, 0}), paddings(height, width) of pooling operator."
+        "(vector<int>, defalut {0, 0}), paddings(height, width) of pooling "
+        "operator. "
         "If globalPooling = true, paddings and will be ignored.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
     AddComment(R"DOC(
+MaxPool2d Operator.
+
 The maxPooling2d with index operation calculates the output and the mask
-based on the input and ksize, strides, paddings parameters. Input(X) and
-output(Out, Mask) are in NCHW format. Where N is batch size, C is the
-number of channels, H and W is the height and width of feature.
+based on the input, ksize, strides, and paddings parameters. Input(X) and
+output(Out, Mask) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, 
+and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out, Mask) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, H_in, W_in)
+       X shape: $(N, C, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, H_out, W_out)
-       Mask shape: (N, C, H_out, W_out)
+       Out shape: $(N, C, H_{out}, W_{out})$
+       Mask shape: $(N, C, H_{out}, W_{out})$
   where
-       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       $$
+       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       $$
+
 )DOC");
   }
 };
@@ -156,70 +165,76 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
   MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "(Tensor), the input tensor of pooling operator. "
-        "The format of input tensor is NCDHW. Where N is batch size, C is "
-        "the number of channels, D, H and W is the depth, height and width of "
-        "image.");
+    AddInput("X",
+             "(Tensor) The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW, where N is batch size, C is "
+             "the number of channels, and D, H and W are the depth, height and "
+             "width of "
+             "the image, respectively");
     AddOutput("Out",
-              "(Tensor), the output tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is "
-              "the number of channels, D, H and W is the depth, height and "
-              "width of image.");
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, "
+              "and D, H and W are the depth, height and "
+              "width of the image, respectively.");
     AddOutput("Mask",
-              "(Tensor), the Mask tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is the number of channels, D, H and W "
-              "is the depth, height and width of image."
-              "The value in it is the index in current feature map");
+              "(Tensor) The Mask tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, and "
+              "D, H and W are the depth, height and width "
+              "of the image, respectively. "
+              "It represents the index in the current feature map.");
 
     AddAttr<std::vector<int>>("ksize",
-                              "(vector), the pooling window size(depth, "
-                              "height, width) of pooling "
-                              "operator."
+                              "(vector<int>) The pooling window size(depth, "
+                              "height, width) of pooling operator. "
                               "If globalPooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "globalPooling",
-        "(bool default: false), whether to use the global pooling."
+        "(bool, default false) Whether to use the global pooling. "
         "If globalPooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
-                              "(vector, default:{1,1,1}), strides(depth, "
+                              "(vector<int>, default {1,1,1}), strides(depth, "
                               "height, width) of pooling operator.")
         .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector defalut:{0,0,0}), paddings(depth, "
-        "height, width) of pooling operator."
+        "(vector, defalut {0,0,0}), paddings(depth, "
+        "height, width) of pooling operator. "
         "If globalPooling = true, paddings and ksize will be ignored.")
         .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
     AddComment(R"DOC(
+MaxPool3d Operator.
+
 The maxpooling3d with index operation calculates the output and the mask
 based on the input and ksize, strides, paddings parameters.
-Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch
-size, C is the number of channels, D, H and W is the depth, height and
-width of feature. Parameters(ksize, strides, paddings) are three elements.
+Input(X) and output(Out, Mask) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. 
+Parameters(ksize, strides, paddings) are three elements.
 These three elements represent depth, height and width, respectively.
 The input(X) size and output(Out, Mask) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, D_in, H_in, W_in)
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, D_out, H_out, W_out)
-       Mask shape: (N, C, D_out, H_out, W_out)
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
   where
-       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+       $$
+       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
+       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       $$
+
 )DOC");
   }
 };
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
index 39da1e0bf89ce308de62d38a6cce6dbd4c7c7f83..641f7135ded159b1c7330e87c4b1d983e959b466 100644
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -92,76 +92,78 @@ class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
                          framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("MaxProbs",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the max probability "
              "of an instance which computed by the previous top_k (k=1) "
              "operator.");
     AddInput("Indices",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the corresponding "
              "index which computed by the previous top_k (k=1) operator.");
     AddInput("Labels",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each element is a label and the "
              "value should be in [0, class_number - 1].");
     AddInput("Weights",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. This input is optional. If provided, "
              "weight of instance would be considered when computing metrics.")
         .AsDispensable();
     AddInput("StatesInfo",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape D x 4, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape D x 4, "
              "where D is the number of classes. This input is optional. If "
              "provided, current state will be accumulated to this state and "
-             "the accumulation state will be as the output state.")
+             "the accumulation state will be the output state.")
         .AsDispensable();
     AddOutput("BatchMetrics",
-              "(Tensor, default Tensor<float>), a 1-D tensor with shape {6}."
-              "This output tensor contains metrics for current batch data."
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for current batch data. "
               "The layout is [macro average precision, macro average recall, "
               "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score]");
+              "micro f1 score].");
     AddOutput("AccumMetrics",
-              "(Tensor, default Tensor<float>), a 1-D tensor with shape {6}."
-              "This output tensor contains metrics for accumulated data."
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for accumulated data. "
               "The layout is [macro average precision, macro average recall, "
               "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score]");
+              "micro f1 score].");
     AddOutput("AccumStatesInfo",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape D x 4, "
+              "(Tensor, default Tensor<float>) A 2-D tensor with shape D x 4, "
               "where D is equal to class number. This output tensor contains "
               "accumulated state variables used to compute metrics. The layout "
               "for each class is [true positives, false positives, "
               "true negatives, false negatives].");
-    AddAttr<int>("class_number", "Number of classes to be evaluated.");
+    AddAttr<int>("class_number", "(int) Number of classes to be evaluated.");
     AddComment(R"DOC(
-When given 'Input(Indices)' and 'Input(Labels)', this operator can be used
+Precision Recall Operator.
+
+When given Input(Indices) and Input(Labels), this operator can be used
 to compute various metrics including:
-  - macro average precision
-  - macro average recall
-  - macro f1 score
-  - micro average precision
-  - micro average recall
-  - micro f1 score
+1. macro average precision
+2. macro average recall
+3. macro f1 score
+4. micro average precision
+5. micro average recall
+6. micro f1 score
 
 To compute the above metrics, we need to do statistics for true positives,
-false positives and false negatives. Here count of true negatives is not
+false positives and false negatives. Here the count of true negatives is not
 necessary, but counting it may provide potential usage and the cost is
-trivial, so the operator also provides count of true negatives.
+trivial, so the operator also provides the count of true negatives.
 
 We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
 state contains statistic variables for corresponding class. Layout of each row
 is: TP(true positives), FP(false positives), TN(true negatives),
-FN(false negatives). If 'Input(Weights)' provided, TP, FP, TN, FN will be
-calculated by given weight instead of instance count.
+FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be
+calculated by given weight instead of the instance count.
 
 This operator also supports metrics computing for cross-batch situation. To
-achieve this, 'Input(StatesInfo)' should be provided. State of current batch
-data will be accumulated to 'Input(StatesInfo)' and 'Output(AccumStatesInfo)'
+achieve this, Input(StatesInfo) should be provided. State of current batch
+data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo)
 is the accumulation state.
 
-'Output(BatchMetrics)' is metrics of current batch data while
-'Output(AccumStatesInfo)' is metrics of accumulation data.
+Output(BatchMetrics) is metrics of current batch data while
+Output(AccumStatesInfo) is metrics of accumulation data.
 
 )DOC");
   }
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index eef2e34eaacf59b9adacb343e9a0091ebabeaea3..055c471b4561e5fd3c7a65c6f81d66cdce1a5578 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -41,17 +41,24 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
   PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of prelu operator.");
-    AddInput("Alpha", "The alpha weight of PRelu operator.");
-    AddOutput("Out", "The output tensor of PRelu operator.");
-    AddComment(R"DOC(PRelu operator
+    AddInput("Alpha", "The alpha weight of prelu operator.");
+    AddOutput("Out", "The output tensor of prelu operator.");
+    AddComment(R"DOC(
+PRelu Operator.
 
 The equation is:
 
-  f(x) = alpha * x , for x < 0
-  f(x) = x         , for x >= 0
+$$
+f(x) =
+\begin{cases}
+\alpha * x, \quad  \text{if} \ x < 0 \\
+x,         \qquad  \text{if} \ x >= 0
+\end{cases}
+$$
 
 The input `X` can carry the LoD (Level of Details) information,
-or not. And the output shares the LoD with input `X`.
+or not. And the output shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc
index 39fbf800031cd559a49654667e5a6f634384523d..36e460103ab46bf6f1408840a0699793e2be134d 100644
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -83,22 +83,26 @@ class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
                    "L1 regularization strength.")
         .SetDefault(0.0f);
     AddAttr<float>("l2",
-                   "(float, default 0.0)"
+                   "(float, default 0.0) "
                    "L2 regularization strength.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+Proximal Adagrad Optimizer.
 
-Optimizer that implements the proximal adagrad algorithm.
+Optimizer that implements the proximal adagrad algorithm:
 
-moment = moment + grad * grad
-prox_param = param - learning_rate * grad * (1 / sqrt(moment))
-param = sign(prox_param) / (1 + learning_rate * l2) *
-        max { |prox_param| - learning_rate * l1 , 0 }
+$$
+moment = moment + grad * grad \\
+prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1 , 0)
+$$
 
 The paper that proposed Proximal GD: 
 (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
 Here, we use the adagrad learning rate as specified here: 
 (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
index e4b014b9f5866ec0791cba9b3998b1734066eeeb..5693d0ec9ebf4c470dfa5141b6eeee431f24f2ea 100644
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -67,19 +67,23 @@ class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
                    "L1 regularization strength.")
         .SetDefault(0.0f);
     AddAttr<float>("l2",
-                   "(float, default 0.0)"
+                   "(float, default 0.0) "
                    "L2 regularization strength.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+ProximalGD Operator.
 
-Optimizer that implements the proximal gradient descent algorithm.
+Optimizer that implements the proximal gradient descent algorithm:
 
-prox_param = param - learning_rate * grad
-param = sign(prox_param) / (1 + learning_rate * l2) *
-        max { |prox_param| - learning_rate * l1 , 0 }
+$$
+prox\_param = param - learning\_rate * grad \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1, 0)
+$$        
 
 The paper that proposed Proximal Gradient Descent:
 (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
+
 )DOC");
   }
 };