Commit 66ea7184 authored by H haowang101779990

en api improve format Dec 27

test=develop
Parent 988bc2b5
......@@ -272,8 +272,7 @@ class DataFeeder(object):
dict: the result of conversion.
Raises:
ValueError: If drop_last is False and the data batch which cannot
fit for devices.
ValueError: If drop_last is False and the data batch cannot fit evenly across the devices.
"""
def __reader_creator__():
......
......@@ -1646,8 +1646,8 @@ class Program(object):
parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
to print.
Returns
(str): The debug string.
Returns:
str: The debug string.
Raises:
ValueError: If any of the required fields is not set and throw_on_error is
......
......@@ -1452,6 +1452,7 @@ class DynamicRNN(object):
def step_input(self, x):
"""
Mark a sequence as a dynamic RNN input.
Args:
x(Variable): The input sequence.
......@@ -1505,6 +1506,7 @@ class DynamicRNN(object):
"""
Mark a variable as a RNN input. The input will not be scattered into
time steps.
Args:
x(Variable): The input variable.
......@@ -1629,13 +1631,11 @@ class DynamicRNN(object):
Args:
init(Variable|None): The initialized variable.
shape(list|tuple): The memory shape. NOTE the shape does not contain
batch_size.
shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size.
value(float): the initialized value.
need_reorder(bool): True if the initialized memory depends on the
input sample.
need_reorder(bool): True if the initialized memory depends on the input sample.
dtype(str|numpy.dtype): The data type of the initialized memory.
......@@ -1714,6 +1714,7 @@ class DynamicRNN(object):
"""
Update the memory from ex_mem to new_mem. NOTE that the shape and data
type of :code:`ex_mem` and :code:`new_mem` must be the same.
Args:
ex_mem(Variable): the memory variable.
new_mem(Variable): the plain variable generated in RNN block.
......
......@@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred,
rpn_negative_overlap=0.3,
use_random=True):
"""
** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
**Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.**
Given the Intersection-over-Union (IoU) overlap between anchors and
ground truth boxes, this layer can be used to assign classification and
......@@ -148,6 +148,7 @@ def rpn_target_assign(bbox_pred,
cls_logits=cls_logits,
anchor_box=anchor_box,
gt_boxes=gt_boxes)
"""
helper = LayerHelper('rpn_target_assign', **locals())
......@@ -1525,20 +1526,23 @@ def anchor_generator(input,
anchors, e.g. [0.5, 1.0, 2.0].
variance(list|tuple): The variances to be used in box regression deltas.
Default:[0.1, 0.1, 0.2, 0.2].
stride(list|turple): The anchors stride across width and height,
e.g. [16.0, 16.0]
stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0]
offset(float): Prior boxes center offset. Default: 0.5
name(str): Name of the prior box op. Default: None.
Returns:
Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
H is the height of input, W is the width of input,
num_anchors is the box count of each position.
Anchors(Variable),Variances(Variable):
two variables:
- Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \
H is the height of input, W is the width of input, \
num_anchors is the box count of each position. \
Each anchor is in (xmin, ymin, xmax, ymax) format and is unnormalized.
Variances(Variable): The expanded variances of anchors
with a layout of [H, W, num_priors, 4].
H is the height of input, W is the width of input
num_anchors is the box count of each position.
- Variances(Variable): The expanded variances of anchors \
with a layout of [H, W, num_priors, 4]. \
H is the height of input, W is the width of input \
num_anchors is the box count of each position. \
Each variance is in (xcenter, ycenter, w, h) format.
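A minimal usage sketch of anchor_generator consistent with the arguments and returns documented above; the conv_feat input and the concrete sizes are illustrative assumptions, not part of this patch:

.. code-block:: python

    import paddle.fluid as fluid

    conv_feat = fluid.layers.data(name='conv_feat', shape=[256, 38, 38], dtype='float32')
    anchors, variances = fluid.layers.anchor_generator(
        input=conv_feat,
        anchor_sizes=[64, 128, 256, 512],
        aspect_ratios=[0.5, 1.0, 2.0],
        variance=[0.1, 0.1, 0.2, 0.2],
        stride=[16.0, 16.0],
        offset=0.5)
    # anchors:   [H, W, num_anchors, 4]
    # variances: [H, W, num_anchors, 4]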
......@@ -1748,7 +1752,7 @@ def generate_proposals(scores,
eta=1.0,
name=None):
"""
** Generate proposal Faster-RCNN **
**Generate proposal Faster-RCNN**
This operation proposes RoIs according to each box's probability of being a foreground object, and
the boxes can be calculated from anchors. Bbox_deltas and the scores of each box being an object are the output of the RPN. Final proposals
......@@ -1762,7 +1766,6 @@ def generate_proposals(scores,
4. Remove predicted boxes with small area.
5. Apply NMS to get final proposals as output.
Args:
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and width of the feature map.
......@@ -1777,6 +1780,7 @@ def generate_proposals(scores,
nms_thresh(float): Threshold in NMS, 0.5 by default.
min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
"""
helper = LayerHelper('generate_proposals', **locals())
......
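A hedged usage sketch for generate_proposals as documented above; the layer names, shapes, and anchor count are illustrative assumptions only:

.. code-block:: python

    import paddle.fluid as fluid

    # Illustrative sizes: A=15 anchors per location on a 38x38 feature map
    scores = fluid.layers.data(name='scores', shape=[15, 38, 38], dtype='float32')
    bbox_deltas = fluid.layers.data(name='bbox_deltas', shape=[60, 38, 38], dtype='float32')
    im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')
    anchors = fluid.layers.data(name='anchors', shape=[38, 38, 15, 4],
                                dtype='float32', append_batch_size=False)
    variances = fluid.layers.data(name='variances', shape=[38, 38, 15, 4],
                                  dtype='float32', append_batch_size=False)

    rois, roi_probs = fluid.layers.generate_proposals(
        scores, bbox_deltas, im_info, anchors, variances,
        pre_nms_top_n=6000, post_nms_top_n=1000, nms_thresh=0.5, min_size=0.1)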
......@@ -949,12 +949,11 @@ def shuffle(reader, buffer_size):
is determined by argument buf_size.
Args:
param reader: the original reader whose output will be shuffled.
type reader: callable
param buf_size: shuffle buffer size.
type buf_size: int
return: the new reader whose output is shuffled.
rtype: callable
reader(callable): the original reader whose output will be shuffled.
buffer_size(int): shuffle buffer size.
Returns:
callable: the new reader whose output is shuffled.
"""
return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
......
......@@ -233,7 +233,7 @@ def fc(input,
dimensions will be flattened to form the first dimension of the final matrix (height of
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
`X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
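To make the num_flatten_dims example above concrete, here is a small numpy sketch of the flattening arithmetic (pure illustration, not fluid code):

.. code-block:: python

    import numpy as np

    x = np.random.rand(2, 3, 4, 5, 6)               # rank(X) = 5
    num_flatten_dims = 3
    h = int(np.prod(x.shape[:num_flatten_dims]))    # 2 * 3 * 4 = 24
    w = int(np.prod(x.shape[num_flatten_dims:]))    # 5 * 6 = 30
    flat = x.reshape(h, w)                          # shape (24, 30), as in the docstring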
......@@ -505,31 +505,33 @@ def lstm(input,
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
.. math::
i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
\\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
$$ h_t = o_t \\odot tanh(c_t) $$
h_t &= o_t \odot tanh(c_t)
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
- $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $\tilde{c_t}$ is also called candidate hidden state,
- The :math:`\odot` is the element-wise product of the vectors.
- :math:`tanh` is the activation function.
- :math:`\\tilde{c_t}` is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
X represents a matrix multiplication
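A single-step numpy sketch of the gate equations above; the per-gate weights are folded into small dictionaries for brevity, and this illustrates the math only, not the fluid op:

.. code-block:: python

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    hidden, inputs = 4, 3
    rng = np.random.RandomState(0)
    Wx = {g: rng.randn(hidden, inputs) for g in 'ifoc'}   # W_{ix}, W_{fx}, W_{ox}, W_{cx}
    Wh = {g: rng.randn(hidden, hidden) for g in 'ifoc'}   # W_{ih}, W_{fh}, W_{oh}, W_{ch}
    b  = {g: np.zeros(hidden) for g in 'ifoc'}            # bx_* + bh_* folded together

    x_t, h_prev, c_prev = rng.randn(inputs), np.zeros(hidden), np.zeros(hidden)

    i_t = sigmoid(Wx['i'] @ x_t + Wh['i'] @ h_prev + b['i'])
    f_t = sigmoid(Wx['f'] @ x_t + Wh['f'] @ h_prev + b['f'])
    o_t = sigmoid(Wx['o'] @ x_t + Wh['o'] @ h_prev + b['o'])
    c_tilde = np.tanh(Wx['c'] @ x_t + Wh['c'] @ h_prev + b['c'])
    c_t = f_t * c_prev + i_t * c_tilde
    h_t = o_t * np.tanh(c_t)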
......@@ -556,13 +558,17 @@ def lstm(input,
Returns:
rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
Three tensors, rnn_out, last_h, last_c:
- rnn_out is the result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
- last_h is the hidden state of the last step of LSTM \
shape is ( num_layers x batch_size x hidden_size ) \
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
- last_c is the cell state of the last step of LSTM \
shape is ( num_layers x batch_size x hidden_size ) \
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
......@@ -1220,6 +1226,8 @@ def dropout(x,
probability) the outputs of some units to zero, while others remain
unchanged.
dropout op can be removed from the program to make the program more efficient.
Args:
x (Variable): The input tensor variable.
dropout_prob (float): Probability of setting units to zero.
......@@ -1230,20 +1238,22 @@ def dropout(x,
units will be dropped. DO NOT use a fixed seed in training.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train']
dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
1. downgrade_in_infer(default), downgrade the outcome at inference
train: out = input * mask
inference: out = input * dropout_prob
(make is a tensor same shape with input, value is 0 or 1
- train: out = input * mask
- inference: out = input * dropout_prob
(mask is a tensor of the same shape as input, its value is 0 or 1,
and the ratio of 0 is dropout_prob)
2. upscale_in_train, upscale the outcome at training time
train: out = input * mask / ( 1.0 - dropout_prob )
inference: out = input
(make is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob)
dropout op can be removed from the program.
the program will be efficient
- train: out = input * mask / ( 1.0 - dropout_prob )
- inference: out = input
(mask is a tensor of the same shape as input, its value is 0 or 1,
and the ratio of 0 is dropout_prob)
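As a concrete illustration of the upscale_in_train mode described above, a minimal numpy sketch (mask sampling simplified; not the fluid kernel):

.. code-block:: python

    import numpy as np

    def dropout_upscale_in_train(x, dropout_prob, is_test):
        if is_test:
            return x                                    # inference: out = input
        mask = (np.random.rand(*x.shape) >= dropout_prob).astype(x.dtype)
        return x * mask / (1.0 - dropout_prob)          # train: out = input * mask / (1 - dropout_prob)

    x = np.random.rand(4, 5).astype('float32')
    y = dropout_upscale_in_train(x, dropout_prob=0.5, is_test=False)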
Returns:
......@@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal.
2) when `soft_label == True`, and the 2nd dimension of
`input` and `label` are not equal.
3) when `soft_label == False`, and the 2nd dimension of
`label` is not 1.
ValueError:
1. the 1st dimension of ``input`` and ``label`` are not equal.
2. when ``soft_label == True``, and the 2nd dimension of
``input`` and ``label`` are not equal.
3. when ``soft_label == False``, and the 2nd dimension of
``label`` is not 1.
Examples:
.. code-block:: python
......@@ -1458,7 +1472,7 @@ def chunk_eval(input,
F1-score of chunk detection.
For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
`Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
......@@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None):
out.lod = [[2, 1]],
out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length**
Note:
The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0.
Args:
......@@ -3013,7 +3028,7 @@ def group_norm(input,
"""
**Group Normalization Layer**
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
Args:
input(Variable): The input tensor variable.
......@@ -3140,8 +3155,8 @@ def conv2d_transpose(input,
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
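A quick arithmetic check of the output-size formulas above, in plain Python; the concrete numbers are illustrative only:

.. code-block:: python

    # H'_out = (H_in - 1) * stride - 2 * padding + dilation * (H_f - 1) + 1
    H_in, stride, padding, dilation, H_f = 14, 2, 1, 1, 4
    H_prime = (H_in - 1) * stride - 2 * padding + dilation * (H_f - 1) + 1
    print(H_prime)   # 28
    # The actual H_out lies in [H_prime, H_prime + stride), i.e. 28 or 29 here.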
Args:
input(Variable): The input image with [N, C, H, W] format.
......@@ -4704,9 +4719,9 @@ def ctc_greedy_decoder(input, blank, name=None):
name (str): The name of this layer. It is optional.
Returns:
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
'Lp' is the sum if all output sequences' length. If all the sequences
in result were empty, the result LoDTensor will be [-1] with
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
'Lp' is the sum of all output sequences' length. If all the sequences \
in result were empty, the result LoDTensor will be [-1] with \
LoD [[]] and dims [1, 1].
Examples:
......@@ -5072,6 +5087,7 @@ def hsigmoid(input,
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
And if you want to use a customized tree by setting 'is_custom' as true, you may need to do the following things first:
1. using your word dict to build a binary tree, each leaf node should be a word of your word dict
2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
......@@ -5079,7 +5095,6 @@ def hsigmoid(input,
4. now, each word should have its path and code along the path, you can pass a batch of path and code
related to the same batch of inputs.
Args:
input (Variable): The input tensor variable with shape
:math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
......@@ -5485,11 +5500,11 @@ def softmax_with_cross_entropy(logits,
.. math::
max_j = \\max_{i=0}^{K}{\\text{logit}_i}
max_j &= \\max_{i=0}^{K}{\\text{logit}_i}
log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
and then cross entropy loss is calculated by softmax and label.
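A numpy sketch of the numerically stable softmax above, followed by the cross entropy with a hard label (illustration of the math only):

.. code-block:: python

    import numpy as np

    logits = np.array([2.0, 1.0, 0.1])
    label = 0                                     # hard label index

    max_j = np.max(logits)                        # max_j
    log_max_sum = np.log(np.sum(np.exp(logits - max_j)))
    softmax = np.exp(logits - max_j - log_max_sum)
    loss = -np.log(softmax[label])                # cross entropy for the hard label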
......@@ -5515,10 +5530,10 @@ def softmax_with_cross_entropy(logits,
along with the cross entropy loss. Default: False
Returns:
Variable or Tuple of two Variables: Return the cross entropy loss if
`return_softmax` is False, otherwise the tuple
(loss, softmax), where the cross entropy loss is
a 2-D tensor with shape [N x 1], and softmax is a
Variable or Tuple of two Variables: Return the cross entropy loss if \
`return_softmax` is False, otherwise the tuple \
(loss, softmax), where the cross entropy loss is \
a 2-D tensor with shape [N x 1], and softmax is a \
2-D tensor with shape [N x K].
Examples:
......@@ -5792,15 +5807,21 @@ def squeeze(input, axes, name=None):
the single dimensions will be removed from the shape. If an axis is
selected with shape entry not equal to one, an error is raised.
Examples:
For example:
.. code-block:: text
Case 1:
Given
X.shape = (1, 3, 1, 5)
and
axes = [0]
we get:
Out.shape = (3, 1, 5)
Case 2:
Given
X.shape = (1, 3, 1, 5)
and
......@@ -5842,6 +5863,9 @@ def unsqueeze(input, axes, name=None):
Dimension indices in axes are as seen in the output tensor.
For example:
.. code-block:: text
Given a tensor with shape [3, 4, 5],
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
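A hedged usage sketch of squeeze and unsqueeze matching the shape examples above; the layer names are assumptions:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[1, 3, 1, 5], dtype='float32',
                          append_batch_size=False)
    y = fluid.layers.squeeze(input=x, axes=[0])        # (1, 3, 1, 5) -> (3, 1, 5)

    t = fluid.layers.data(name='t', shape=[3, 4, 5], dtype='float32',
                          append_batch_size=False)
    u = fluid.layers.unsqueeze(input=t, axes=[0, 4])   # (3, 4, 5) -> (1, 3, 4, 5, 1)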
......@@ -6729,8 +6753,11 @@ def sequence_scatter(input, index, updates, name=None):
the columns to update in each row of X.
Here is an example:
Given the following input:
.. code-block:: text
input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
......@@ -6743,7 +6770,9 @@ def sequence_scatter(input, index, updates, name=None):
updates.lod = [[ 0, 3, 8, 12]]
Then we have the output:
.. code-block:: text
out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.4, 1.3, 1.2, 1.1],
[1.0, 1.0, 1.3, 1.2, 1.4, 1.1]]
......@@ -6759,7 +6788,7 @@ def sequence_scatter(input, index, updates, name=None):
name (str|None): The output variable name. Default None.
Returns:
output (Variable): The output is a tensor with the same shape as input.
Variable: The output is a tensor with the same shape as input.
Examples:
......@@ -6933,7 +6962,7 @@ def mean_iou(input, label, num_classes):
.. math::
IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}.
The predictions are accumulated in a confusion matrix and mean-IOU
is then calculated from it.
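A numpy sketch of the mean-IOU formula above, accumulated from a confusion matrix (illustration only, not the fluid op):

.. code-block:: python

    import numpy as np

    num_classes = 3
    pred  = np.array([0, 1, 1, 2, 2, 2])
    label = np.array([0, 1, 2, 2, 2, 0])

    cm = np.zeros((num_classes, num_classes), dtype=np.int64)
    for p, l in zip(pred, label):
        cm[l, p] += 1                        # rows: ground truth, cols: prediction

    tp = np.diag(cm).astype(np.float64)      # true positives per class
    fp = cm.sum(axis=0) - tp                 # false positives per class
    fn = cm.sum(axis=1) - tp                 # false negatives per class
    iou = tp / (tp + fp + fn)
    mean_iou = np.nanmean(iou)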
......@@ -6946,9 +6975,13 @@ def mean_iou(input, label, num_classes):
num_classes (int): The possible number of labels.
Returns:
mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
Three variables:
- mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
- out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
- out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
Examples:
......@@ -7144,7 +7177,7 @@ def affine_grid(theta, out_shape, name=None):
Args:
theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
out_shape can be a Variable or a list or tuple.
``out_shape`` can be a Variable or a list or tuple.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
......@@ -7157,6 +7190,7 @@ def affine_grid(theta, out_shape, name=None):
Examples:
.. code-block:: python
theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
data = fluid.layers.affine_grid(theta, out_shape)
......@@ -7192,9 +7226,10 @@ def affine_grid(theta, out_shape, name=None):
def rank_loss(label, left, right, name=None):
"""
**Rank loss layer for RankNet**
RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)
`RankNet <http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf>`_
is a pairwise ranking model with a training sample consisting of a pair
of documents, A and B. Label P indicates whether A is ranked higher than B
or not:
......@@ -7202,16 +7237,19 @@ def rank_loss(label, left, right, name=None):
P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
about the rank of the input pair.
Rank loss layer takes three inputs: left (o_i), right (o_j) and
label (P_{i,j}). The inputs respectively represent RankNet's output scores
Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and
label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores
for documents A and B and the value of label P. The following equation
computes rank loss C_{i,j} from the inputs:
$$
C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
o_{i,j} = o_i - o_j \\
\tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
$$
.. math::
C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\
o_{i,j} &= o_i - o_j \\\\
\\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \}
Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).
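A numpy sketch of the rank loss equations above; the scores and label are made-up values:

.. code-block:: python

    import numpy as np

    o_i, o_j = 2.5, 1.0          # RankNet scores for documents A and B
    P_ij = 1.0                   # label: A is ranked higher than B

    o_ij = o_i - o_j
    C_ij = -P_ij * o_ij + np.log(1.0 + np.exp(o_ij))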
......@@ -7237,7 +7275,6 @@ def rank_loss(label, left, right, name=None):
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
out = fluid.layers.rank_loss(label, left, right)
"""
helper = LayerHelper('rank_loss', **locals())
......@@ -7269,7 +7306,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
.. math::
rank\_loss &= max(0, -label * (left - right) + margin)
rank\_loss = max(0, -label * (left - right) + margin)
Args:
label (Variable): Indicates whether the left is ranked higher than the right or not.
......@@ -7278,12 +7315,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
margin (float): Indicates the given margin.
name (str|None): A name for this layer (optional). If set None, the layer
will be named automatically.
Returns:
Variable: The ranking loss.
Raises:
ValueError: Any of label, left, and right is not a Variable.
Examples:
.. code-block:: python
label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
......@@ -7587,7 +7629,8 @@ def prelu(x, mode, param_attr=None, name=None):
"""
Equation:
y = \max(0, x) + alpha * \min(0, x)
.. math::
y = \max(0, x) + \\alpha * \min(0, x)
Args:
x (Variable): The input tensor.
......@@ -7730,20 +7773,29 @@ def flatten(x, axis=1, name=None):
**Flatten layer**
Flattens the input tensor into a 2D matrix.
Examples:
For example:
.. code-block:: text
Case 1:
Given
X.shape = (3, 100, 100, 4)
and
axis = 2
We get:
Out.shape = (3 * 100, 4 * 100)
Case 2:
Given
X.shape = (3, 100, 100, 4)
and
axis = 0
We get:
Out.shape = (1, 3 * 100 * 100 * 4)
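A hedged usage sketch of flatten matching Case 1 above; the layer name is an assumption:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3, 100, 100, 4], dtype='float32',
                          append_batch_size=False)
    out = fluid.layers.flatten(x, axis=2)   # (3, 100, 100, 4) -> (3 * 100, 100 * 4) = (300, 400)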
......@@ -7759,9 +7811,9 @@ def flatten(x, axis=1, name=None):
will be named automatically.
Returns:
Variable: A 2D tensor with the contents of the input tensor, with input
dimensions up to axis flattened to the outer dimension of
the output and remaining input dimensions flattened into the
Variable: A 2D tensor with the contents of the input tensor, with input \
dimensions up to axis flattened to the outer dimension of \
the output and remaining input dimensions flattened into the \
inner dimension of the output.
Raises:
......@@ -7801,15 +7853,19 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
The enumerated sequence has the same 1st dimension with variable `input`, and
the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
Examples:
.. code-block:: text
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
......@@ -8896,6 +8952,7 @@ def similarity_focus(input, axis, indexes, name=None):
SimilarityFocus Operator
Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
......@@ -8969,14 +9026,16 @@ def similarity_focus(input, axis, indexes, name=None):
indexes(list): Indicating the indexes of the selected dimension.
Returns:
Variable: A tensor variable with the same shape and same type
Variable: A tensor variable with the same shape and same type \
as the input.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[2, 3, 2, 2], dtype='float32')
x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
"""
helper = LayerHelper('similarity_focus', **locals())
# check attrs
......@@ -9055,6 +9114,7 @@ def hash(input, hash_size, num_hash=1, name=None):
Examples:
.. code-block:: python
word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
......@@ -9075,13 +9135,15 @@ def hash(input, hash_size, num_hash=1, name=None):
def grid_sampler(x, grid, name=None):
"""
This operation samples input X by using bilinear interpolation based on
flow field grid, which is usually gennerated by affine_grid. The grid of
flow field grid, which is usually generated by :code:`affine_grid` . The grid of
shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
with shape [N, H, W] each, where grid_x is indexing the 4th dimension
(in width dimension) of input data x and grid_y is indexing the 3rd
dimension (in height dimension), finally the result is the bilinear
interpolation value of the 4 nearest corner points.
.. code-block:: text
Step 1:
Get (x, y) grid coordinates and scale to [0, H-1/W-1].
......@@ -9126,16 +9188,18 @@ def grid_sampler(x, grid, name=None):
name (str, default None): The name of this layer.
Returns:
out(Variable): Output of shape [N, C, H, W] data samples input X
Variable: Output of shape [N, C, H, W], which is the data of input X sampled
using bilinear interpolation based on the input grid.
Exmples:
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32])
out = fluid.layers.grid_sampler(x=x, grid=grid)
"""
helper = LayerHelper("grid_sampler", **locals())
......@@ -9203,19 +9267,19 @@ def add_position_encoding(input, alpha, beta, name=None):
"""
**Add Position Encoding Layer**
This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
output Tensor of shape [N x M x P] with positional encoding value.
Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
.. math::
PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\
PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\
Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\
PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})} \\\\
Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where:
* PE(pos, 2i): the increment for the number at even position
* PE(pos, 2i + 1): the increment for the number at odd position
- :math:`PE(pos, 2i)` : the increment for the number at even position
- :math:`PE(pos, 2i + 1)` : the increment for the number at odd position
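A numpy sketch of the PE terms and the output combination defined above; the sizes and the values of alpha and beta are purely illustrative:

.. code-block:: python

    import numpy as np

    N, M, P = 2, 6, 8                          # batch, sequence length, feature size
    alpha, beta = 1.0, 1.0
    x = np.random.rand(N, M, P).astype('float32')

    pe = np.zeros((M, P), dtype='float32')
    pos = np.arange(M)[:, None]
    i = np.arange(0, P, 2)[None, :]            # even feature indices 2i
    pe[:, 0::2] = np.sin(pos / np.power(10000.0, 1.0 * i / P))
    pe[:, 1::2] = np.cos(pos / np.power(10000.0, 1.0 * i / P))

    out = alpha * x + beta * pe                # PE broadcasts over the batch dimension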
Args:
input (Variable): 3-D input tensor with shape [N x M x P]
......@@ -9230,6 +9294,7 @@ def add_position_encoding(input, alpha, beta, name=None):
.. code-block:: python
position_tensor = fluid.layers.add_position_encoding(input=tensor)
"""
helper = LayerHelper('add_position_encoding', **locals())
dtype = helper.input_dtype()
......@@ -9262,13 +9327,13 @@ def bilinear_tensor_product(x,
For example:
.. math::
out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
In this formula:
- :math:`x`: the first input contains M elements, shape is [batch_size, M].
- :math:`y`: the second input contains N elements, shape is [batch_size, N].
- :math:`W_{i}`: the i-th learned weight, shape is [M, N]
- :math:`out{i}`: the i-th element of out, shape is [batch_size, size].
- :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
- :math:`y^\mathrm{T}`: the transpose of :math:`y`.
Args:
......
......@@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input,
It also sets *stop_gradient* to True.
>>> data = fluid.layers.fill_constant_batch_size_like(
>>> input=like, shape=[1], value=0, dtype='int64')
Args:
input(${input_type}): ${input_comment}.
......@@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input,
Returns:
${out_comment}.
Examples:
.. code-block:: python
data = fluid.layers.fill_constant_batch_size_like(
input=like, shape=[1], value=0, dtype='int64')
"""
helper = LayerHelper("fill_constant_batch_size_like", **locals())
out = helper.create_variable_for_type_inference(dtype=dtype)
......
......@@ -362,7 +362,7 @@ class ChunkEvaluator(MetricBase):
compute the precision, recall and F1-score using the accumulated counter
numbers.
For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
`Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
......@@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase):
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
"""
Update the states based on the layers.chunk_eval() outputs.
Args:
num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
......@@ -450,9 +451,9 @@ class EditDistance(MetricBase):
distance, instance_error = distance_evaluator.eval()
In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass.
- 'distance' is the average of the edit distance in a pass.
- 'instance_error' is the instance error rate in a pass.
"""
......@@ -567,12 +568,15 @@ class DetectionMAP(object):
Calculate the detection mean average precision (mAP).
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
Args:
......@@ -615,8 +619,10 @@ class DetectionMAP(object):
In the above example:
'cur_map_v' is the mAP of current mini-batch.
'accum_map_v' is the accumulative mAP of one pass.
- 'cur_map_v' is the mAP of current mini-batch.
- 'accum_map_v' is the accumulative mAP of one pass.
"""
def __init__(self,
......
......@@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size):
class DistributeTranspilerConfig(object):
"""
Args:
slice_var_up (bool): Do Tensor slice for pservers, default is True.
split_method (PSDispatcher): RoundRobin or HashName can be used
try to choose the best method to balance loads for pservers.
min_block_size (int): Minimum splitted element number in block.
According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
.. py:attribute:: slice_var_up (bool)
Do Tensor slice for pservers, default is True.
.. py:attribute:: split_method (PSDispatcher)
RoundRobin or HashName can be used.
Try to choose the best method to balance loads for pservers.
.. py:attribute:: min_block_size (int)
Minimum number of split elements in block.
According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
We can use bandwidth efficiently when data size is larger than 2MB. If you
want to change it, please be sure you see the slice_variable function.
want to change it, please be sure you have read the slice_variable function.
"""
slice_var_up = True
......
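A hedged usage sketch of DistributeTranspilerConfig with the attributes documented above; the endpoints, trainer id, and min_block_size value are placeholders, not recommendations:

.. code-block:: python

    import paddle.fluid as fluid

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = True
    config.min_block_size = 8192

    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id=0,
                pservers="127.0.0.1:6174,127.0.0.1:6175",
                trainers=2)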