diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index af02721eb72c1d0f8aa3d7ab8db504c4c33b64d5..c280ff21eec8d1a90b8be9102d7eae119f38f2b1 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -272,8 +272,7 @@ class DataFeeder(object):
             dict: the result of conversion.
 
         Raises:
-            ValueError: If drop_last is False and the data batch which cannot
-            fit for devices.
+            ValueError: If drop_last is False and the data batch cannot fit the devices.
         """
 
         def __reader_creator__():
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 823b6d80be13b1baf1e62ef616cdf68ff7515a68..45e6a856f209d0b5badb22ce40063960087809d9 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1638,8 +1638,8 @@ class Program(object):
                 parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
                 to print.
 
-        Returns
-            (str): The debug string.
+        Returns:
+            str : The debug string.
 
         Raises:
             ValueError: If any of required fields is not set and throw_on_error is
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 9d98e8333ba07ac3eed3a3b63adcba1919cb4694..a7494aaceab42332cb4362ab1df43d9e0b139f4f 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
     def step_input(self, x):
         """
         Mark a sequence as a dynamic RNN input.
+
         Args:
             x(Variable): The input sequence.
 
@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
         """
         Mark a variable as a RNN input. The input will not be scattered into
         time steps.
+
         Args:
             x(Variable): The input variable.
 
@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
         Args:
             init(Variable|None): The initialized variable.
 
-            shape(list|tuple): The memory shape. NOTE the shape does not contain
-            batch_size.
+            shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.
 
             value(float): the initalized value.
 
-            need_reorder(bool): True if the initialized memory depends on the
-            input sample.
+            need_reorder(bool): True if the initialized memory depends on the input sample.
 
             dtype(str|numpy.dtype): The data type of the initialized memory.
 
@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
         """
         Update the memory from ex_mem to new_mem. NOTE that the shape and data
         type of :code:`ex_mem` and :code:`new_mem` must be same.
+        
         Args:
             ex_mem(Variable): the memory variable.
             new_mem(Variable): the plain variable generated in RNN block.
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index ce731f39ea099a4d8948812989ad19b3cce119ff..8aed97dc59b100d4e37832e0a148d73662742ba0 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
                       rpn_negative_overlap=0.3,
                       use_random=True):
     """
-    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
+    **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**
 
     This layer can be, for given the  Intersection-over-Union (IoU) overlap
     between anchors and ground truth boxes, to assign classification and
@@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred,
     Examples:
         .. code-block:: python
 
-        bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
-                          append_batch_size=False, dtype='float32')
-        cls_logits = layers.data(name='cls_logits', shape=[100, 1],
-                          append_batch_size=False, dtype='float32')
-        anchor_box = layers.data(name='anchor_box', shape=[20, 4],
-                          append_batch_size=False, dtype='float32')
-        gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
-                         append_batch_size=False, dtype='float32')
-        loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
-            fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
-                                          cls_logits=cls_logits,
-                                          anchor_box=anchor_box,
-                                          gt_boxes=gt_boxes)
+            bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
+                              append_batch_size=False, dtype='float32')
+            cls_logits = layers.data(name='cls_logits', shape=[100, 1],
+                              append_batch_size=False, dtype='float32')
+            anchor_box = layers.data(name='anchor_box', shape=[20, 4],
+                              append_batch_size=False, dtype='float32')
+            gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
+                             append_batch_size=False, dtype='float32')
+            loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = (
+                fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
+                                              cls_logits=cls_logits,
+                                              anchor_box=anchor_box,
+                                              gt_boxes=gt_boxes))
+
     """
 
     helper = LayerHelper('rpn_target_assign', **locals())
@@ -1519,27 +1520,30 @@ def anchor_generator(input,
     Args:
        input(Variable): The input feature map, the format is NCHW.
        anchor_sizes(list|tuple|float): The anchor sizes of generated anchors,
-       given in absolute pixels e.g. [64., 128., 256., 512.].
-       For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
+                                       given in absolute pixels e.g. [64., 128., 256., 512.].
+                                       For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
        aspect_ratios(list|tuple|float): The height / width ratios of generated
-            anchors, e.g. [0.5, 1.0, 2.0].
+                                        anchors, e.g. [0.5, 1.0, 2.0].
        variance(list|tuple): The variances to be used in box regression deltas.
-            Default:[0.1, 0.1, 0.2, 0.2].
-       stride(list|turple): The anchors stride across width and height,
-            e.g. [16.0, 16.0]
+                             Default:[0.1, 0.1, 0.2, 0.2].
+       stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0]
        offset(float): Prior boxes center offset. Default: 0.5
        name(str): Name of the prior box op. Default: None.
 
     Returns:
-        Anchors(Variable):  The output anchors with a layout of [H, W, num_anchors, 4].
-              H is the height of input, W is the width of input,
-              num_anchors is the box count of each position.
-              Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-        Variances(Variable): The expanded variances of anchors
-              with a layout of [H, W, num_priors, 4].
-              H is the height of input, W is the width of input
-              num_anchors is the box count of each position.
-              Each variance is in (xcenter, ycenter, w, h) format.
+        Anchors(Variable),Variances(Variable):  
+        
+              two variables:
+        
+              - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
+                H is the height of input, W is the width of input, \
+                num_anchors is the box count of each position.  \
+                Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
+              - Variances(Variable): The expanded variances of anchors \
+                with a layout of [H, W, num_priors, 4]. \
+                H is the height of input, W is the width of input, \
+                num_anchors is the box count of each position. \
+                Each variance is in (xcenter, ycenter, w, h) format.
 
 
     Examples:
@@ -1748,35 +1752,35 @@ def generate_proposals(scores,
                        eta=1.0,
                        name=None):
     """
-    ** Generate proposal Faster-RCNN **
-	
-	This operation proposes RoIs according to each box with their probability to be a foreground object and 
-	the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
-	could be used to train detection net.
-
-	For generating proposals, this operation performs following steps:
-
-	1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
- 	2. Calculate box locations as proposals candidates. 
-	3. Clip boxes to image
-	4. Remove predicted boxes with small area. 
-	5. Apply NMS to get final proposals as output.
-	
-      
-	Args:
-		scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
-			N is batch size, A is number of anchors, H and W are height and width of the feature map.
-		bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. 
-		im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
-			between origin image size and the size of feature map.
-		anchors(Variable):   A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
-              		num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-		variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
-		pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
-		post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
-		nms_thresh(float): Threshold in NMS, 0.5 by default.
-		min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
-		eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
+    **Generate proposal Faster-RCNN**
+
+    This operation proposes RoIs according to each box with their probability to be a foreground object and 
+    the box can be calculated by anchors. The bbox_deltas and the scores to be an object are the outputs of RPN. Final proposals
+    could be used to train detection net.
+
+    For generating proposals, this operation performs following steps:
+
+    1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
+    2. Calculate box locations as proposals candidates. 
+    3. Clip boxes to image
+    4. Remove predicted boxes with small area. 
+    5. Apply NMS to get final proposals as output.
+
+    Args:
+        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
+            N is batch size, A is number of anchors, H and W are height and width of the feature map.
+        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location.
+        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
+            between origin image size and the size of feature map.
+        anchors(Variable):   A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
+                    num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
+        variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
+        pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
+        post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
+        nms_thresh(float): Threshold in NMS, 0.5 by default.
+        min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
+        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
+
     """
     helper = LayerHelper('generate_proposals', **locals())
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 42f4959a83fe113d6cbbe0db355249a9c203d602..9a29b2509357c93a684d736cf0d2523970fb5ff1 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
     is determined by argument buf_size.
 
     Args:
-        param reader: the original reader whose output will be shuffled.
-        type reader: callable
-        param buf_size: shuffle buffer size.
-        type buf_size: int
-        return: the new reader whose output is shuffled.
-        rtype: callable
+        reader(callable): the original reader whose output will be shuffled.
+        buffer_size(int): shuffle buffer size.
+
+    Returns:
+        callable: the new reader whose output is shuffled.
     """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index cc1fdbd285611379cc4fa44d2373748aa6e24faf..8f43c6f226bf51ef387acbb335412d5a84516257 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -233,7 +233,7 @@ def fc(input,
             dimensions will be flatten to form the first dimension of the final matrix (height of
             the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
             form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
             Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
         param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
             parameters/weights of this layer.
@@ -502,46 +502,48 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation
 
     A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
-    $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
-
-    $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
-
-    $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
-
-    $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
-
-    $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
-
-    $$ h_t = o_t \\odot tanh(c_t) $$
-
-    - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+    .. math::
+    
+       i_t &= \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
+
+       f_t &= \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
+
+       o_t &= \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
+
+       \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
+
+       c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
+
+       h_t &= o_t \\odot tanh(c_t)
+
+    - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
       of weights from the input gate to the input)
     - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
     - sigmoid is the logistic sigmoid function.
     - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
       and cell activation vectors, respectively, all of which have the same size as
       the cell output activation vector $h$.
-    - The $\odot$ is the element-wise product of the vectors.
-    - `tanh` is the activation functions.
-    - $\tilde{c_t}$ is also called candidate hidden state,
+    - The :math:`\\odot` is the element-wise product of the vectors.
+    - :math:`tanh` is the activation function.
+    - :math:`\\tilde{c_t}` is also called candidate hidden state,
       which is computed based on the current input and the previous hidden state.
 
-    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
+    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, 
     X represensts a matrix multiplication
 
 
     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM
+        init_h(Variable): The initial hidden state of the LSTM                       
                        This is a tensor with shape ( num_layers x batch_size x hidden_size)
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT be greater than max_len
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -556,14 +558,18 @@ def lstm(input,
 
 
     Returns:
-        rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
-                         if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
-        last_h(Tensor): the hidden state of the last step of LSTM
-                        shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
-        last_c(Tensor): the cell state of the last step of LSTM
-                        shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):  
+                        
+                        Three tensors, rnn_out, last_h, last_c:
+                        
+                        - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
+                          if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
+                        - last_h is the hidden state of the last step of LSTM \
+                          shape is ( num_layers x batch_size x hidden_size ) \
+                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+                        - last_c is the cell state of the last step of LSTM \
+                          shape is ( num_layers x batch_size x hidden_size ) \
+                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
 
 
     Examples:
@@ -1220,6 +1226,8 @@ def dropout(x,
     probability) the outputs of some units to zero, while others are remain
     unchanged.
 
+    dropout op can be removed from the program to make the program more efficient.
+
     Args:
         x (Variable): The input tensor variable.
         dropout_prob (float): Probability of setting units to zero.
@@ -1230,22 +1238,24 @@ def dropout(x,
                     units will be dropped. DO NOT use a fixed seed in training.
         name (str|None): A name for this layer(optional). If set None, the layer
                          will be named automatically.
-        dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train']
+        dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
+
                                         1. downgrade_in_infer(default), downgrade the outcome at inference
-                                           train: out = input * mask
-                                           inference: out = input * dropout_prob
-                                           (make is a tensor same shape with input, value is 0 or 1
-                                            ratio of 0 is dropout_prob)
+
+                                           - train: out = input * mask
+                                           - inference: out = input * (1.0 - dropout_prob)
+
+                                           (mask is a tensor same shape with input, value is 0 or 1
+                                           ratio of 0 is dropout_prob)
                                         2. upscale_in_train, upscale the outcome at training time
-                                           train: out = input * mask / ( 1.0 - dropout_prob )
-                                           inference: out = input
-                                           (make is a tensor same shape with input, value is 0 or 1
-                                            ratio of 0 is dropout_prob)
-                                           dropout op can be removed from the program.
-                                           the program will be efficient
 
+                                           - train: out = input * mask / ( 1.0 - dropout_prob )
+                                           - inference: out = input
 
+                                           (mask is a tensor same shape with input, value is 0 or 1
+                                           ratio of 0 is dropout_prob)
 
+                                        
     Returns:
         Variable: A tensor variable is the shape with `x`.
 
@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
          A 2-D tensor with shape [N x 1], the cross entropy loss.
 
     Raises:
-        `ValueError`: 1) the 1st dimension of `input` and `label` are not equal.
-                      2) when `soft_label == True`, and the 2nd dimension of
-                         `input` and `label` are not equal.
-                      3) when `soft_label == False`, and the 2nd dimension of
-                         `label` is not 1.
+         ValueError:
+
+                      1. the 1st dimension of ``input`` and ``label`` are not equal.
+                      
+                      2. when ``soft_label == True``, and the 2nd dimension of
+                         ``input`` and ``label`` are not equal.
+                         
+                      3. when ``soft_label == False``, and the 2nd dimension of
+                         ``label`` is not 1.
 
     Examples:
         .. code-block:: python
@@ -1457,8 +1471,8 @@ def chunk_eval(input,
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
 
-    For some basics of chunking, please refer to
-    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+    For some basics of chunking, please refer to 
+    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
 
     ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
     and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
@@ -1823,7 +1837,7 @@ def conv2d(input,
             of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-             and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, conv2d
@@ -2276,7 +2290,7 @@ def sequence_slice(input, offset, length, name=None):
 
     .. code-block:: text
 
-	- Case:
+              - Case:
 
             Given the input Variable **input**:
 
@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None):
                 out.lod = [[2, 1]],
                 out.dims = (3, 2).
 
-    NOTE: The first dimension size of **input**, **offset** and **length**
+    Note: 
+          The first dimension size of **input**, **offset** and **length**
           should be equal. The **offset** should start from 0.
 
     Args:
@@ -3013,7 +3028,7 @@ def group_norm(input,
     """
     **Group Normalization Layer**
 
-    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
 
     Args:
         input(Variable): The input tensor variable.
@@ -3140,8 +3155,8 @@ def conv2d_transpose(input,
 
            H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
            W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
-           W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
+           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
 
     Args:
         input(Variable): The input image with [N, C, H, W] format.
@@ -4673,7 +4688,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
-
+      
         Computation:
 
         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4704,10 +4719,10 @@ def ctc_greedy_decoder(input, blank, name=None):
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
-                  'Lp' is the sum if all output sequences' length. If all the sequences
-                  in result were empty, the result LoDTensor will be [-1] with
-                  LoD [[]] and dims [1, 1].
+        Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
+                  'Lp' is the sum of all output sequences' length. If all the sequences \
+                  in result were empty, the result LoDTensor will be [-1] with  \
+                  LoD [[]] and dims [1, 1]. 
 
     Examples:
         .. code-block:: python
@@ -5060,7 +5075,7 @@ def hsigmoid(input,
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to
+    complete binary tree, or you can use is_custom to pass your own tree to 
     implement hierarchical. Each leaf node represents a class(a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -5072,13 +5087,13 @@ def hsigmoid(input,
     <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
 
     And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first:
-        1. using your word dict to build a binary tree, each leaf node should be an word of your word dict
-        2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
-        3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
-         means label of each binary classification, using 1 indicate true, 0 indicate false.
-        4. now, each word should has its path and code along the path, you can pass a batch of path and code
-        related to the same batch of inputs.
 
+    1. using your word dict to build a binary tree, each leaf node should be a word of your word dict
+    2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
+    3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
+       means label of each binary classification, using 1 to indicate true, 0 to indicate false.
+    4. now, each word should have its path and code along the path, you can pass a batch of path and code
+       related to the same batch of inputs.
 
     Args:
         input (Variable): The input tensor variable with shape
@@ -5086,8 +5101,8 @@ def hsigmoid(input,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, 
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num 
             which indicates the num of classes using by binary classify.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
              of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5100,15 +5115,15 @@ def hsigmoid(input,
              is not set, the bias is initialized zero. Default: None.
         name (str|None): A name for this layer(optional). If set None, the layer
              will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root,
+        path_table: (Variable|None) this variable can store each batch of samples' path to root, 
             it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        path_code:  (Variable|None) this variable can store each batch of samples' code,
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like 
+            structure and each element in this array is indexes in parent nodes' Weight Matrix. 
+        path_code:  (Variable|None) this variable can store each batch of samples' code, 
             each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if custom is
              set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient 
              of W and input will be sparse.
 
     Returns:
@@ -5485,11 +5500,11 @@ def softmax_with_cross_entropy(logits,
 
     .. math::
 
-        max_j = \\max_{i=0}^{K}{\\text{logit}_i}
+        max_j &= \\max_{i=0}^{K}{\\text{logit}_i}
 
-        log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
+        log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
 
-        softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
+        softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
 
     and then cross entropy loss is calculated by softmax and label.
 
@@ -5515,11 +5530,11 @@ def softmax_with_cross_entropy(logits,
                                along with the cross entropy loss. Default: False
 
     Returns:
-        Variable or Tuple of two Variables: Return the cross entropy loss if
-                              `return_softmax` is False, otherwise the tuple
-                              (loss, softmax), where the cross entropy loss is
-                              a 2-D tensor with shape [N x 1], and softmax is a
-                              2-D tensor with shape [N x K].
+        Variable or Tuple of two Variables: Return the cross entropy loss if \
+                                            `return_softmax` is False, otherwise the tuple \
+                                            (loss, softmax), where the cross entropy loss is \
+                                            a 2-D tensor with shape [N x 1], and softmax is a \
+                                            2-D tensor with shape [N x K].
 
     Examples:
         .. code-block:: python
@@ -5792,21 +5807,27 @@ def squeeze(input, axes, name=None):
     the single dimensions will be removed from the shape. If an axis is
     selected with shape entry not equal to one, an error is raised.
 
-    Examples:
-    Case 1:
-      Given
-        X.shape = (1, 3, 1, 5)
-      and
-        axes = [0]
-      we get:
-        Out.shape = (3, 1, 5)
-      Case 2:
-        Given
-          X.shape = (1, 3, 1, 5)
-        and
-          axes = []
-        we get:
-          Out.shape = (3, 5)
+    For example:
+
+    .. code-block:: text
+
+        Case 1:
+
+          Given
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = [0]
+          we get:
+            Out.shape = (3, 1, 5)
+
+        Case 2:
+
+          Given
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = []
+          we get:
+            Out.shape = (3, 5)
 
     Args:
         input (Variable): The input variable to be squeezed.
@@ -5842,6 +5863,9 @@ def unsqueeze(input, axes, name=None):
     Dimension indices in axes are as seen in the output tensor.
 
     For example:
+
+    .. code-block:: text
+
       Given a tensor such that tensor with shape [3, 4, 5],
       then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
 
@@ -6729,8 +6753,11 @@ def sequence_scatter(input, index, updates, name=None):
     the columns to update in each row of X.
 
     Here is an example:
+
     Given the following input:
+
     .. code-block:: text
+
         input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                       [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                       [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
@@ -6743,7 +6770,9 @@ def sequence_scatter(input, index, updates, name=None):
         updates.lod =  [[  0,            3,                                 8,                         12]]
 
     Then we have the output:
+
     .. code-block:: text
+
         out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
                     [1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
                     [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
@@ -6759,7 +6788,7 @@ def sequence_scatter(input, index, updates, name=None):
         name (str|None): The output variable name. Default None.
 
     Returns:
-        output (Variable): The output is a tensor with the same shape as input.
+        Variable: The output is a tensor with the same shape as input.
 
     Examples:
 
@@ -6933,7 +6962,7 @@ def mean_iou(input, label, num_classes):
 
     .. math::
 
-        IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
+        IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}.
 
     The predictions are accumulated in a confusion matrix and mean-IOU
     is then calculated from it.
@@ -6946,9 +6975,13 @@ def mean_iou(input, label, num_classes):
         num_classes (int): The possible number of labels.
 
     Returns:
-        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
-        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
-        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
+        mean_iou (Variable),out_wrong(Variable),out_correct(Variable): 
+        
+                     Three variables:
+                      
+                     - mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
+                     - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
+                     - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
 
     Examples:
 
@@ -7143,8 +7176,8 @@ def affine_grid(theta, out_shape, name=None):
 
     Args:
         theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
-        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
-        out_shape can be a Variable or a list or tuple.
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. 
+                                             ``out_shape`` can be a Variable or a list or tuple.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
 
@@ -7157,6 +7190,7 @@ def affine_grid(theta, out_shape, name=None):
     Examples:
 
         .. code-block:: python
+
             theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
             out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
             data = fluid.layers.affine_grid(theta, out_shape)
@@ -7192,9 +7226,10 @@ def affine_grid(theta, out_shape, name=None):
 
 def rank_loss(label, left, right, name=None):
     """
+
     **Rank loss layer for RankNet**
 
-    RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)
+    `RankNet <http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf>`_
     is a pairwise ranking model with a training sample consisting of a pair
     of documents, A and B. Label P indicates whether A is ranked higher than B
     or not:
@@ -7202,16 +7237,19 @@ def rank_loss(label, left, right, name=None):
     P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
     about the rank of the input pair.
 
-    Rank loss layer takes three inputs: left (o_i), right (o_j) and
-    label (P_{i,j}). The inputs respectively represent RankNet's output scores
+    Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and
+    label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores
     for documents A and B and the value of label P. The following equation
     computes rank loss C_{i,j} from the inputs:
 
-    $$
-      C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
-      o_{i,j} =  o_i - o_j  \\
-      \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-    $$
+    .. math::
+
+      C_{i,j} &= -\\tilde{P_{i,j}} * o_{i,j} + \\log(1 + e^{o_{i,j}}) \\\\
+
+      o_{i,j} &=  o_i - o_j  \\\\
+
+      \\tilde{P_{i,j}} &= \\left \\{0, 0.5, 1 \\right \\} \\ or \\ \\left \\{0, 1 \\right \\}
+
 
     Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
 
@@ -7237,7 +7275,6 @@ def rank_loss(label, left, right, name=None):
             right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
             out = fluid.layers.rank_loss(label, left, right)
 
-
     """
     helper = LayerHelper('rank_loss', **locals())
 
@@ -7269,7 +7306,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
 
     .. math::
 
-        rank\_loss &= max(0, -label * (left - right) + margin)
+        rank\_loss = max(0, -label * (left - right) + margin)
 
     Args:
        label (Variable): Indicates whether the left is ranked higher than the right or not.
@@ -7278,12 +7315,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
        margin (float): Indicates the given margin.
        name (str|None): A name for this layer (optional). If set None, the layer
                        will be named automatically.
+
     Returns:
        Variable: The ranking loss.
+
     Raises:
        ValueError: Any of label, left, and right is not a Variable.
+
     Examples:
+
         .. code-block:: python
+
            label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
            left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
            right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
@@ -7587,7 +7629,8 @@ def prelu(x, mode, param_attr=None, name=None):
     """
     Equation:
 
-        y = \max(0, x) + alpha * \min(0, x)
+    .. math::
+        y = \\max(0, x) + \\alpha * \\min(0, x)
 
     Args:
         x (Variable): The input tensor.
@@ -7653,8 +7696,8 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
 
         .. code-block:: python
 
-        x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-        y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0)
+            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
+            y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0)
     """
     helper = LayerHelper('brelu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7683,8 +7726,8 @@ def leaky_relu(x, alpha=0.02, name=None):
 
         .. code-block:: python
 
-        x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-        y = fluid.layers.leaky_relu(x, alpha=0.01)
+            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
+            y = fluid.layers.leaky_relu(x, alpha=0.01)
     """
     helper = LayerHelper('leaky_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7712,8 +7755,8 @@ def soft_relu(x, threshold=40.0, name=None):
 
         .. code-block:: python
 
-        x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
-        y = fluid.layers.soft_relu(x, threshold=20.0)
+            x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
+            y = fluid.layers.soft_relu(x, threshold=20.0)
     """
     helper = LayerHelper('soft_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -7729,23 +7772,32 @@ def flatten(x, axis=1, name=None):
     """
     **Flatten layer**
     Flattens the input tensor into a 2D matrix.
+    
+    For Example:
+    
+    .. code-block:: text
 
-    Examples:
-    Case 1:
-      Given
-        X.shape = (3, 100, 100, 4)
-      and
-        axis = 2
-      We get:
-        Out.shape = (3 * 100, 4 * 100)
-
-    Case 2:
-      Given
-        X.shape = (3, 100, 100, 4)
-      and
-        axis = 0
-      We get:
-        Out.shape = (1, 3 * 100 * 100 * 4)
+        Case 1:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            axis = 2
+
+          We get:
+            Out.shape = (3 * 100, 4 * 100)
+
+        Case 2:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            axis = 0
+
+          We get:
+            Out.shape = (1, 3 * 100 * 100 * 4)
 
     Args:
         x (Variable): A tensor of rank >= axis.
@@ -7759,9 +7811,9 @@ def flatten(x, axis=1, name=None):
                         will be named automatically.
 
     Returns:
-        Variable: A 2D tensor with the contents of the input tensor, with input
-                  dimensions up to axis flattened to the outer dimension of
-                  the output and remaining input dimensions flattened into the
+        Variable: A 2D tensor with the contents of the input tensor, with input \
+                  dimensions up to axis flattened to the outer dimension of \
+                  the output and remaining input dimensions flattened into the \
                   inner dimension of the output.
 
     Raises:
@@ -7801,19 +7853,23 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
     The enumerated sequence has the same 1st dimension with variable `input`, and
     the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
 
-    Examples:
-    Case 1:
-      Input:
-        X.lod = [[0, 3, 5]]
-        X.data = [[1], [2], [3], [4], [5]]
-        X.dims = [5, 1]
-      Attrs:
-        win_size = 2
-        pad_value = 0
-      Output:
-        Out.lod = [[0, 3, 5]]
-        Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
-        Out.dims = [5, 2]
+    .. code-block:: text
+
+        Case 1:
+
+          Input:
+            X.lod = [[0, 3, 5]]
+            X.data = [[1], [2], [3], [4], [5]]
+            X.dims = [5, 1]
+
+          Attrs:
+            win_size = 2
+            pad_value = 0
+
+          Output:
+            Out.lod = [[0, 3, 5]]
+            Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
+            Out.dims = [5, 2]
 
     Args:
         input (Variable): The input variable which is a index sequence.
@@ -8896,6 +8952,7 @@ def similarity_focus(input, axis, indexes, name=None):
     SimilarityFocus Operator
 
     Generate a similarity focus mask with the same shape of input using the following method:
+    
     1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
        to the axis according to the indexes. For example, if axis=1 and indexes=[a],
        it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
@@ -8969,14 +9026,16 @@ def similarity_focus(input, axis, indexes, name=None):
         indexes(list): Indicating the indexes of the selected dimension.
 
     Returns:
-        Variable: A tensor variable with the same shape and same type
-            as the input.
+        Variable: A tensor variable with the same shape and same type \
+                  as the input.
 
     Examples:
         .. code-block:: python
+
             data = fluid.layers.data(
               name='data', shape=[2, 3, 2, 2], dtype='float32')
             x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0])
+
     """
     helper = LayerHelper('similarity_focus', **locals())
     # check attrs
@@ -9055,6 +9114,7 @@ def hash(input, hash_size, num_hash=1, name=None):
 
     Examples:
        .. code-block:: python
+
            word_dict = paddle.dataset.imdb.word_dict()
            x = fluid.layers.data(shape[1], dtype='int32', lod_level=1)
            out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
@@ -9075,50 +9135,52 @@ def hash(input, hash_size, num_hash=1, name=None):
 def grid_sampler(x, grid, name=None):
     """
     This operation samples input X by using bilinear interpolation based on
-    flow field grid, which is usually gennerated by affine_grid. The grid of
+    flow field grid, which is usually generated by :code:`affine_grid` . The grid of
     shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
     with shape [N, H, W] each, where grid_x is indexing the 4th dimension
     (in width dimension) of input data x and grid_y is indexng the 3rd
     dimention (in height dimension), finally results is the bilinear
     interpolation value of 4 nearest corner points.
 
-    Step 1:
-    Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+    .. code-block:: text
+
+        Step 1:
+        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
 
-    grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
-    grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
 
-    Step 2:
-    Indices input data X with grid (x, y) in each [H, W] area, and bilinear
-    interpolate point value by 4 nearest points.
+        Step 2:
+        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
+        interpolate point value by 4 nearest points.
 
-      wn ------- y_n ------- en
-      |           |           |
-      |          d_n          |
-      |           |           |
-     x_w --d_w-- grid--d_e-- x_e
-      |           |           |
-      |          d_s          |
-      |           |           |
-      ws ------- y_s ------- wn
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- es
 
-    x_w = floor(x)              // west side x coord
-    x_e = x_w + 1               // east side x coord
-    y_n = floor(y)              // north side y coord
-    y_s = y_s + 1               // south side y coord
+        x_w = floor(x)              // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(y)              // north side y coord
+        y_s = y_n + 1               // south side y coord
 
-    d_w = grid_x - x_w          // distance to west side
-    d_e = x_e - grid_x          // distance to east side
-    d_n = grid_y - y_n          // distance to north side
-    d_s = y_s - grid_y          // distance to south side
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
 
-    wn = X[:, :, y_n, x_w]      // north-west point value
-    en = X[:, :, y_n, x_e]      // north-east point value
-    ws = X[:, :, y_s, x_w]      // south-east point value
-    es = X[:, :, y_s, x_w]      // north-east point value
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-west point value
+        es = X[:, :, y_s, x_e]      // south-east point value
 
-    output = wn * d_e * d_s + en * d_w * d_s
-           + ws * d_e * d_n + es * d_w * d_n
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
 
     Args:
         x(Variable): Input data of shape [N, C, H, W].
@@ -9126,16 +9188,18 @@ def grid_sampler(x, grid, name=None):
         name (str, default None): The name of this layer.
 
     Returns:
-        out(Variable): Output of shape [N, C, H, W] data samples input X
+        Variable: Output of shape [N, C, H, W] data samples input X
         using bilnear interpolation based on input grid.
 
-    Exmples:
-    .. code-block:: python
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
+            theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
+            grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32])
+            out = fluid.layers.grid_sampler(x=x, grid=grid)
 
-        x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
-        theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
-        grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]})
-        out = fluid.layers.grid_sampler(x=x, grid=grid)
     """
     helper = LayerHelper("grid_sampler", **locals())
 
@@ -9203,19 +9267,19 @@ def add_position_encoding(input, alpha, beta, name=None):
     """
     **Add Position Encoding Layer**
 
-    This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
+    This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
     output Tensor of shape [N x M x P] with positional encoding value.
 
-    Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
+    Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
 
     .. math::
-        PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})}   \\\\
-        PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})}  \\\\
-        Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
+        PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})}   \\\\
+        PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})}  \\\\
+        Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
 
     Where:
-    * PE(pos, 2i): the increment for the number at even position
-    * PE(pos, 2i + 1): the increment for the number at odd position
+      - :math:`PE(pos, 2i)` : the increment for the number at even position
+      - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
 
     Args:
         input (Variable): 3-D input tensor with shape [N x M x P]
@@ -9230,6 +9294,7 @@ def add_position_encoding(input, alpha, beta, name=None):
         .. code-block:: python
 
           position_tensor = fluid.layers.add_position_encoding(input=tensor)
+
     """
     helper = LayerHelper('add_position_encoding', **locals())
     dtype = helper.input_dtype()
@@ -9262,13 +9327,13 @@ def bilinear_tensor_product(x,
     For example:
 
     .. math::
-       out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
+       out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
 
     In this formula:
       - :math:`x`: the first input contains M elements, shape is [batch_size, M].
       - :math:`y`: the second input contains N elements, shape is [batch_size, N].
       - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
-      - :math:`out{i}`: the i-th element of out, shape is [batch_size, size].
+      - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
       - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
 
     Args:
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 49a486cf0c3d11b18417e8838aead07d748f3e02..4399d96626b8523c351cc9b22806d04b3e4aca07 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
 
     It also sets *stop_gradient* to True.
 
-    >>> data = fluid.layers.fill_constant_batch_size_like(
-    >>>             input=like, shape=[1], value=0, dtype='int64')
-
     Args:
         input(${input_type}): ${input_comment}.
 
@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
 
     Returns:
         ${out_comment}.
+
+    Examples:
+
+        .. code-block:: python
+
+             data = fluid.layers.fill_constant_batch_size_like(
+                         input=like, shape=[1], value=0, dtype='int64')
+
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_variable_for_type_inference(dtype=dtype)
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 85af8fea13d5b9a1e22014fbd727e1baed3247be..fd07ff0ba3d21721fbbc46099f7dcb6937f93524 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase):
     Accumulate counter numbers output by chunk_eval from mini-batches and
     compute the precision recall and F1-score using the accumulated counter
     numbers.
-    For some basics of chunking, please refer to
-    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+    For some basics of chunking, please refer to 
+    `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
     ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
     and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
 
@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
     def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
         """
         Update the states based on the layers.chunk_eval() ouputs.
+
         Args:
             num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
             num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
                 distance, instance_error = distance_evaluator.eval()
 
     In the above example:
-        'distance' is the average of the edit distance in a pass.
 
-        'instance_error' is the instance error rate in a pass.
+        - 'distance' is the average of the edit distance in a pass.
+        - 'instance_error' is the instance error rate in a pass.
 
     """
 
@@ -567,12 +568,15 @@ class DetectionMAP(object):
     Calculate the detection mean average precision (mAP).
 
     The general steps are as follows:
+
     1. calculate the true positive and false positive according to the input
-        of detection and labels.
+       of detection and labels.
     2. calculate mAP value, support two versions: '11 point' and 'integral'.
 
     Please get more information from the following articles:
+
       https://sanchom.wordpress.com/tag/average-precision/
+
       https://arxiv.org/abs/1512.02325
 
     Args:
@@ -613,10 +617,12 @@ class DetectionMAP(object):
                 for data in batches:
                     loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
 
-        In the above example:
+    In the above example:
+
+            - 'cur_map_v' is the mAP of current mini-batch.
+            - 'accum_map_v' is the accumulative mAP of one pass.
 
-            'cur_map_v' is the mAP of current mini-batch.
-            'accum_map_v' is the accumulative mAP of one pass.
+ 
     """
 
     def __init__(self,
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index d21ec42dccde80fd354a730274edb04f654113c3..c128843885fbce29893a4b24c65482abaf870e82 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
 
 class DistributeTranspilerConfig(object):
     """
-    Args:
-        slice_var_up (bool): Do Tensor slice for pservers, default is True.
-        split_method (PSDispatcher): RoundRobin or HashName can be used
-          try to choose the best method to balance loads for pservers.
-        min_block_size (int): Minimum splitted element number in block.
-          According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
+    .. py:attribute:: slice_var_up (bool)
+
+          Do Tensor slice for pservers, default is True.
+
+    .. py:attribute:: split_method (PSDispatcher)
+
+          RoundRobin or HashName can be used.
+          Try to choose the best method to balance loads for pservers.
+
+    .. py:attribute:: min_block_size (int)
+
+          Minimum number of split elements in block.
+
+          According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
           We can use bandwidth effiently when data size is larger than 2MB.If you
-          want to change it, please be sure you see the slice_variable function.
+          want to change it, please be sure you have read the slice_variable function.
+
     """
 
     slice_var_up = True