[Docs]fix math api en docs issue (#47448)

* fix_docx_stanh * fix einsum api en docs issue * fix model api en docs issue * for codestyle * fix_einsum.py_einsum, test=document_fix * fix_model.py_Model, test=ducument_fix * fix_creation.py_meshgrid, test=document_fix * fix_linalg.py_slogdet, test=document_fix * fix_loss.py_SoftMarginLoss_CrossEntropyLoss_NLLLoss_BCELoss, test=document_fix * norm.py_SyncBatchNorm, test=document-fix * norm.py_SyncBatchNorm, test=document_fix * norm.py_SyncBatchNorm, test=document_fix * list18-30, test=document_fix * refix_list1-15, test=document_fix * deletefiles, test=document_fix * fixedapi_pre-commit, test=document_fix * fix_list31-45, test=document_fix * list111, test=document_fix * some_fix, test=document_fix * some_fix, test=document_fix * somefix, test=document_fix * somefix, test=document_fix * refix, test=document_fix * refix, test=document_fix * refix, test=document_fix * refix, test=document_fix * rerfix, test=document_fix Co-authored-by: Ligoml <limengliu@tiaozhan.com>

[Docs]fix math api en docs issue (#47448)
* fix_docx_stanh * fix einsum api en docs issue * fix model api en docs issue * for codestyle * fix_einsum.py_einsum, test=document_fix * fix_model.py_Model, test=ducument_fix * fix_creation.py_meshgrid, test=document_fix * fix_linalg.py_slogdet, test=document_fix * fix_loss.py_SoftMarginLoss_CrossEntropyLoss_NLLLoss_BCELoss, test=document_fix * norm.py_SyncBatchNorm, test=document-fix * norm.py_SyncBatchNorm, test=document_fix * norm.py_SyncBatchNorm, test=document_fix * list18-30, test=document_fix * refix_list1-15, test=document_fix * deletefiles, test=document_fix * fixedapi_pre-commit, test=document_fix * fix_list31-45, test=document_fix * list111, test=document_fix * some_fix, test=document_fix * some_fix, test=document_fix * somefix, test=document_fix * somefix, test=document_fix * refix, test=document_fix * refix, test=document_fix * refix, test=document_fix * refix, test=document_fix * rerfix, test=document_fix Co-authored-by: Ligoml <limengliu@tiaozhan.com>
94c6ec86 · ustiniankw · GitHub · 51b08123 · 94c6ec86 · 94c6ec86
24 changed file
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
--- a/python/paddle/distributed/fleet/base/topology.py
+++ b/python/paddle/distributed/fleet/base/topology.py
@@ -25,12 +25,13 @@ _HYBRID_PARALLEL_GROUP = None
 class ParallelMode:
    """
    There are all the parallel modes currently supported:
        - DATA_PARALLEL: Distribute input data to different devices.
        - TENSOR_PARALLEL: Shards tensors in the network to different devices.
        - PIPELINE_PARALLEL: Place different layers of the network on different devices.
-    - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
+        - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.
-                         corresponding to the parameters to each device.
    Examples:
        .. code-block:: python

--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -97,6 +97,7 @@ def _check_var_exists(var_name):
 def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.
    Note:
@@ -112,6 +113,7 @@ def init_parallel_env():
    Examples:
        .. code-block:: python
            # required: gpu
            import paddle
            import paddle.nn as nn
@@ -152,6 +154,7 @@ def init_parallel_env():
            if __name__ == '__main__':
                dist.spawn(train)
    """
    # 0. get env & check world size

--- a/python/paddle/geometric/message_passing/send_recv.py
+++ b/python/paddle/geometric/message_passing/send_recv.py
@@ -236,13 +236,13 @@ def send_ue_recv(
        src_index (Tensor): A 1-D tensor, and the available data type is int32, int64.
        dst_index (Tensor): A 1-D tensor, and should have the same shape as `src_index`.
                            The available data type is int32, int64.
-        message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
+        message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
-        reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`.
+        reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
                         Default value is `sum`.
-        out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or
+        out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
                                    out_size is smaller or equal to 0, then this input will not be used.
                                    Otherwise, `out_size` should be equal with or larger than
-                                    max(dst_index) + 1.
+                                    max(dst_index) + 1. Default value is `None`.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.

--- a/python/paddle/geometric/reindex.py
+++ b/python/paddle/geometric/reindex.py
@@ -25,6 +25,7 @@ def reindex_graph(
    x, neighbors, count, value_buffer=None, index_buffer=None, name=None
 ):
    """
    Reindex Graph API.
    This API is mainly used in Graph Learning domain, which should be used
@@ -48,12 +49,12 @@ def reindex_graph(
                            should be the same with `x`.
        count (Tensor): The neighbor count of the input nodes `x`. And the
                        data type should be int32.
-        value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32,
+        value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
-                                    and should be filled with -1. Only useful for gpu version.
+                                    and should be filled with -1. Only useful for gpu version. Default is None.
-        index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32,
+        index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
                                    and should be filled with -1. Only useful for gpu version.
                                    `value_buffer` and `index_buffer` should be both not None
-                                    if you want to speed up by using hashtable buffer.
+                                    if you want to speed up by using hashtable buffer. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.
@@ -68,6 +69,7 @@ def reindex_graph(
        .. code-block:: python
            import paddle
            x = [0, 1, 2]
            neighbors = [8, 9, 0, 4, 7, 6, 7]
            count = [2, 3, 2]
@@ -137,6 +139,7 @@ def reindex_heter_graph(
    x, neighbors, count, value_buffer=None, index_buffer=None, name=None
 ):
    """
    Reindex HeterGraph API.
    This API is mainly used in Graph Learning domain, which should be used
@@ -160,12 +163,12 @@ def reindex_heter_graph(
                                The data type should be the same with `x`.
        count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
                            And the data type should be int32.
-        value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32,
+        value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
-                                    and should be filled with -1. Only useful for gpu version.
+                                    and should be filled with -1. Only useful for gpu version. Default is None.
-        index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32,
+        index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
                                    and should be filled with -1. Only useful for gpu version.
                                    `value_buffer` and `index_buffer` should be both not None
-                                    if you want to speed up by using hashtable buffer.
+                                    if you want to speed up by using hashtable buffer. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.
@@ -182,6 +185,7 @@ def reindex_heter_graph(
        .. code-block:: python
            import paddle
            x = [0, 1, 2]
            neighbors_a = [8, 9, 0, 4, 7, 6, 7]
            count_a = [2, 3, 2]

--- a/python/paddle/geometric/sampling/neighbors.py
+++ b/python/paddle/geometric/sampling/neighbors.py
@@ -31,6 +31,7 @@ def sample_neighbors(
    name=None,
 ):
    """
    Graph Sample Neighbors API.
    This API is mainly used in Graph Learning domain, and the main purpose is to
@@ -51,16 +52,16 @@ def sample_neighbors(
                         The data type should be the same with `row`.
        input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
                              data type should be the same with `row`.
-        sample_size (int): The number of neighbors we need to sample. Default value is -1,
+        sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
                           which means returning all the neighbors of the input nodes.
-        eids (Tensor): The eid information of the input graph. If return_eids is True,
+        eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
                            then `eids` should not be None. The data type should be the
                            same with `row`. Default is None.
-        return_eids (bool): Whether to return eid information of sample edges. Default is False.
+        return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
-        perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
+        perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
                              is True, then `perm_buffer` should not be None. The data type should
                              be the same with `row`. If not None, we will use fisher-yates sampling
-                              to speed up. Only useful for gpu version.
+                              to speed up. Only useful for gpu version. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.
@@ -77,6 +78,7 @@ def sample_neighbors(
        .. code-block:: python
            import paddle
            # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
            #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
            row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]

--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -258,7 +258,9 @@ def _update_input_info(inputs):
 class StaticGraphAdapter:
    """
    Model training/inference with a static graph.
    """
    def __init__(self, model):
@@ -1005,6 +1007,7 @@ class DynamicGraphAdapter:
 class Model:
    """
    A Model object is a network with training and inference features.
    Dynamic graph and static graph are supported at the same time,
    switched by `paddle.enable_static()`. The usage is as follows.
@@ -1145,6 +1148,7 @@ class Model:
    def train_batch(self, inputs, labels=None, update=True):
        """
        Run one training step on one batch of data. The `update` argument indicates
        whether the optimizer updates the gradients computed by this batch.
@@ -1190,6 +1194,7 @@ class Model:
                loss = model.train_batch([data], [label])
                print(loss)
                # [array([2.192784], dtype=float32)]
        """
        loss = self._adapter.train_batch(inputs, labels, update)
        if fluid._non_static_mode() and self._input_info is None:
@@ -1199,6 +1204,7 @@ class Model:
    @no_grad()
    def eval_batch(self, inputs, labels=None):
        """
        Run one evaluating step on a batch of data.
        Args:
@@ -1242,6 +1248,7 @@ class Model:
                loss, acc = model.eval_batch([data], [label])
                print(loss, acc)
                # [array([2.8825705], dtype=float32)] [0.0]
        """
        loss = self._adapter.eval_batch(inputs, labels)
        if fluid._non_static_mode() and self._input_info is None:
@@ -1251,6 +1258,7 @@ class Model:
    @no_grad()
    def predict_batch(self, inputs):
        """
        Run one predicting step on a batch of data.
        Args:
@@ -1289,6 +1297,7 @@ class Model:
                # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
                #          0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
                #          dtype=float32)]
        """
        loss = self._adapter.predict_batch(inputs)
        if fluid._non_static_mode() and self._input_info is None:
@@ -1297,6 +1306,7 @@ class Model:
    def save(self, path, training=True):
        """
        This function saves parameters, optimizer information or model and
        parameters only for inference to path. It depends on the parameter
        `training`.
@@ -1364,6 +1374,7 @@ class Model:
                model.fit(data, epochs=1, batch_size=32, verbose=0)
                model.save('checkpoint/test')  # save for training
                model.save('inference_model', False)  # save for inference
        """
        if ParallelEnv().local_rank == 0:
@@ -1374,6 +1385,7 @@ class Model:
    def load(self, path, skip_mismatch=False, reset_optimizer=False):
        """
        Load from files storing the model states and optimizer states. The file
        for optimizer states is not necessary if no need to restore the optimizer.
@@ -1421,6 +1433,7 @@ class Model:
                model.save('checkpoint/test')
                model.load('checkpoint/test')
        """
        def _load_state_from_path(path):
@@ -1491,6 +1504,7 @@ class Model:
    def parameters(self, *args, **kwargs):
        """
        Returns a list of parameters of the model.
        Returns:
@@ -1513,6 +1527,7 @@ class Model:
                    nn.Linear(200, 10)), input)
                params = model.parameters()
        """
        return self._adapter.parameters()
@@ -1609,6 +1624,7 @@ class Model:
        self, optimizer=None, loss=None, metrics=None, amp_configs=None
    ):
        """
        Configures the model before running.
        Args:
@@ -1640,6 +1656,7 @@ class Model:
        Returns:
            None
        """
        self._place = _get_device()
        if isinstance(self._place, fluid.CUDAPlace):
@@ -1699,6 +1716,7 @@ class Model:
        num_iters=None,
    ):
        """
        Trains the model for a fixed number of epochs. If `eval_data` is set,
        evaluation will be done at the end of each epoch.
@@ -1753,7 +1771,7 @@ class Model:
               How to make a batch is done internally.
            .. code-block:: python
-              :name: code-example1
+              :name: code-example3
                import paddle
                import paddle.vision.transforms as T
@@ -1793,7 +1811,7 @@ class Model:
               DataLoader.
            .. code-block:: python
-              :name: code-example2
+              :name: code-example4
                import paddle
                import paddle.vision.transforms as T
@@ -1830,6 +1848,7 @@ class Model:
                            val_loader,
                            epochs=2,
                            save_dir='mnist_checkpoint')
        """
        assert train_data is not None, "train_data must be given!"

--- a/python/paddle/incubate/operators/graph_sample_neighbors.py
+++ b/python/paddle/incubate/operators/graph_sample_neighbors.py
@@ -37,6 +37,7 @@ def graph_sample_neighbors(
    name=None,
 ):
    """
    Graph Sample Neighbors API.
    This API is mainly used in Graph Learning domain, and the main purpose is to
@@ -72,14 +73,13 @@ def graph_sample_neighbors(
                              For more information, please refer to :ref:`api_guide_Name`.
    Returns:
-        out_neighbors (Tensor): The sample neighbors of the input nodes.
+        - out_neighbors (Tensor): The sample neighbors of the input nodes.
-        out_count (Tensor): The number of sampling neighbors of each input node, and the shape
+        - out_count (Tensor): The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
-                            should be the same with `input_nodes`.
+        - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the sample edges.
-        out_eids (Tensor): If `return_eids` is True, we will return the eid information of the
-                           sample edges.
    Examples:
        .. code-block:: python
            import paddle
            # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
            #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)

--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -710,6 +710,7 @@ def upsample(
    name=None,
 ):
    """
    This API resizes a batch of images.
    The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
@@ -720,11 +721,12 @@ def upsample(
    and the resizing only applies on the three dimensions(depth, height and width).
    Supporting resample methods:
-        'linear' : Linear interpolation
+    - 'linear' : Linear interpolation
-        'bilinear' : Bilinear interpolation
+    - 'bilinear' : Bilinear interpolation
-        'trilinear' : Trilinear interpolation
+    - 'trilinear' : Trilinear interpolation
-        'nearest' : Nearest neighbor interpolation
+    - 'nearest' : Nearest neighbor interpolation
-        'bicubic' : Bicubic interpolation
+    - 'bicubic' : Bicubic interpolation
    Linear interpolation is the method of using a line connecting two known quantities
    to determine the value of an unknown quantity between the two known quantities.
@@ -826,8 +828,9 @@ def upsample(
                D_out = D_{in} * scale_{factor}
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}
-    https://en.wikipedia.org/wiki/Linear_interpolation.
    For details of linear interpolation, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Linear_interpolation.
    For details of nearest neighbor interpolation, please refer to Wikipedia:
    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
@@ -871,6 +874,7 @@ def upsample(
        name(str, optional): The default value is None.
                             Normally there is no need for user to set this property.
                             For more information, please refer to :ref:`api_guide_Name`
    Returns:
        A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),

--- a/python/paddle/nn/functional/distance.py
+++ b/python/paddle/nn/functional/distance.py
@@ -23,6 +23,7 @@ __all__ = []
 def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None):
    r"""
    It computes the pairwise distance between two vectors. The
    distance is calculated by p-order norm:
@@ -48,6 +49,7 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None):
    Returns:
        Tensor, the dtype is same as input tensor.
        - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
          depending on whether the input has data shaped as :math:`[N, D]`.
        - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,

--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1305,6 +1305,7 @@ def margin_ranking_loss(
 def l1_loss(input, label, reduction='mean', name=None):
    r"""
    Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
    If `reduction` set to ``'none'``, the loss is:
@@ -1336,7 +1337,7 @@ def l1_loss(input, label, reduction='mean', name=None):
    Returns:
        Tensor, the L1 Loss of Tensor ``input`` and ``label``.
-        If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+        If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
        If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
    Examples:
@@ -1359,6 +1360,7 @@ def l1_loss(input, label, reduction='mean', name=None):
            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
            print(l1_loss.numpy())
            # [1.4]
    """
    if reduction not in ['sum', 'mean', 'none']:
        raise ValueError(
@@ -2281,6 +2283,7 @@ def cross_entropy(
    name=None,
 ):
    r"""
    By default, this operator implements the cross entropy loss function with softmax. This function
    combines the calculation of the softmax operation and the cross entropy loss function
    to provide a more numerically stable computing.
@@ -2394,21 +2397,13 @@ def cross_entropy(
    Parameters:
+        input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
-        - **input** (Tensor)
-            Input tensor, the data type is float32, float64. Shape is
-        :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` .
            Note:
+                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
-                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
-                output of softmax operator, which will produce incorrect results.
                2. when use_softmax=False, it expects the output of softmax operator.
-        - **label** (Tensor)
+        label (Tensor):
            1. If soft_label=False, the shape is
            :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
            the data type is int32, int64, float32, float64, where each value is [0, C-1].
@@ -2416,48 +2411,27 @@ def cross_entropy(
            2. If soft_label=True, the shape and data type should be same with ``input`` ,
            and the sum of the labels for each sample should be 1.
-        - **weight** (Tensor, optional)
+        weight (Tensor, optional): a manual rescaling weight given to each class.
-            a manual rescaling weight given to each class.
            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .
+        ignore_index (int64, optional): Specifies a target value that is ignored
-        - **ignore_index** (int64, optional)
-            Specifies a target value that is ignored
            and does not contribute to the loss. A negative value means that no label
            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .
+        reduction (str, optional): Indicate how to average the loss by batch_size,
-        - **reduction** (str, optional)
-            Indicate how to average the loss by batch_size,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
            Default is ``'mean'``.
+        soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
-        - **soft_label** (bool, optional)
+        axis (int, optional): The index of dimension to perform softmax calculations.
-            Indicate whether label is soft.
-            Default is ``False``.
-        - **axis** (int, optional)
-            The index of dimension to perform softmax calculations.
            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
            number of dimensions of input :attr:`input`.
            Default is ``-1`` .
+        use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
-        - **use_softmax** (bool, optional)
-            Indicate whether compute softmax before cross_entropy.
            Default is ``True``.
+        name (str, optional): The name of the operator. Default is ``None`` .
-        - **name** (str, optional)
-            The name of the operator. Default is ``None`` .
            For more information, please refer to :ref:`api_guide_Name` .
    Returns:
@@ -2473,9 +2447,7 @@ def cross_entropy(
        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
    Examples:
        .. code-block:: python
            # hard labels
@@ -3958,6 +3930,7 @@ def multi_margin_loss(
 def soft_margin_loss(input, label, reduction='mean', name=None):
    """
    The API measures the soft margin loss between input predictions ``input``
    and target labels ``label`` . It can be described as:
@@ -3966,7 +3939,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
    Parameters:
-        input (Tensor): The input predications tensor with shape: [N, *],
+        input (Tensor): The input predictions tensor with shape: ``[N, *]``,
            N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
            Available dtype is float32, float64.
@@ -3986,8 +3959,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
    Returns:
-        Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+        Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
-            same as ``input`` , else the shape of output is [1].
    Examples:
        .. code-block:: python
@@ -4013,6 +3985,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
            #         [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
            #         [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
            #         [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
    """
    if reduction not in ['sum', 'mean', 'none']:
        raise ValueError(

--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -1735,15 +1735,17 @@ def adaptive_avg_pool1d(x, output_size, name=None):
 def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
    r"""
    Applies 2D adaptive avg pooling on input tensor. The h and w dimensions
    of the output tensor are determined by the parameter output_size.
    For avg adaptive pool2d:
    ..  math::
-        hstart &= floor(i * H_{in} / H_{out})
+        hstart &= floor(i * H_{in} / H_{out}) \\
-        hend &= ceil((i + 1) * H_{in} / H_{out})
+        hend &= ceil((i + 1) * H_{in} / H_{out}) \\
-        wstart &= floor(j * W_{in} / W_{out})
+        wstart &= floor(j * W_{in} / W_{out}) \\
-        wend &= ceil((j + 1) * W_{in} / W_{out})
+        wend &= ceil((j + 1) * W_{in} / W_{out}) \\
        Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)}
    Args:
@@ -1752,14 +1754,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two elements, (H, W). H and W can be either an int, or None which means
            the size will be the same as that of the input.
-        data_format (str): The data format of the input and output data. An optional string
+        data_format (str, optional): The data format of the input and output data. An optional string
            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
            the order of: [batch_size, input_channels, input_height, input_width].
        name(str, optional): For detailed information, please refer
                             to :ref:`api_guide_Name`. Usually name is no need to set and
                             None by default.
    Returns:
-        Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
+        Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
    Examples:
        .. code-block:: python
@@ -1787,6 +1790,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
                            x = x,
                            output_size=[3, 3])
            # out.shape is [2, 3, 3, 3]
    """
    if not in_dynamic_mode():
        check_variable_and_dtype(
@@ -1879,34 +1883,36 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
 def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
    r"""
    This operation applies 3D adaptive avg pooling on input tensor. The d, h and w dimensions
    of the output tensor are determined by the parameter output_size.
    For avg adaptive pool3d:
    ..  math::
-        dstart &= floor(i * D_{in} / D_{out})
+        dstart &= floor(i * D_{in} / D_{out}) \\
-        dend &= ceil((i + 1) * D_{in} / D_{out})
+        dend &= ceil((i + 1) * D_{in} / D_{out}) \\
-        hstart &= floor(j * H_{in} / H_{out})
+        hstart &= floor(j * H_{in} / H_{out}) \\
-        hend &= ceil((j + 1) * H_{in} / H_{out})
+        hend &= ceil((j + 1) * H_{in} / H_{out}) \\
-        wstart &= floor(k * W_{in} / W_{out})
+        wstart &= floor(k * W_{in} / W_{out}) \\
-        wend &= ceil((k + 1) * W_{in} / W_{out})
+        wend &= ceil((k + 1) * W_{in} / W_{out}) \\
        Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]}
            {(dend - dstart) * (hend - hstart) * (wend - wstart)}
    Args:
        x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
            The data type can be float32, float64.
-        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or
-            it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means
+            list, it must contain three elements, (D, H, W). D, H and W can be either an int,
-            the size will be the same as that of the input.
+            or None which means the size will be the same as that of the input.
-        data_format (str): The data format of the input and output data. An optional string
+        data_format (str, optional): The data format of the input and output data. An optional string
            from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
            the order of: [batch_size, input_channels, input_depth, input_height, input_width].
-        name(str, optional): For detailed information, please refer
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
-                             to :ref:`api_guide_Name`. Usually name is no need to set and
+            Usually name is no need to set and None by default.
-                             None by default.
    Returns:
-        Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
+        Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
    Examples:
        .. code-block:: python
@@ -1936,6 +1942,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
                            x = input_data,
                            output_size=[3, 3, 3])
            # out.shape is [2, 3, 3, 3, 3]
    """
    if not in_dynamic_mode():
        check_variable_and_dtype(

--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -1449,15 +1449,16 @@ class Maxout(Layer):
 class Softmax2D(Layer):
    r"""
    Softmax2D Activation.
    Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j).
    The sum of result in each location (C, H_i, W_j) will be one.
    Shape:
        - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)`
-        - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input)
+        - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input)
-    Return:
+    Returns:
        A Tensor of the same shape and dtype as input with value in range [0, 1].
    Examples:
@@ -1482,6 +1483,7 @@ class Softmax2D(Layer):
            #   [[0.42368975 0.51082766 0.47752273 0.5258871 ]
            #    [0.66754097 0.47182566 0.5187628  0.5402329 ]
            #    [0.49014282 0.46369177 0.50340754 0.5289428 ]]]]
    """
    def __init__(self, name=None):

--- a/python/paddle/nn/layer/distance.py
+++ b/python/paddle/nn/layer/distance.py
@@ -20,6 +20,7 @@ __all__ = []
 class PairwiseDistance(Layer):
    r"""
    It computes the pairwise distance between two vectors. The
    distance is calculated by p-order norm:
@@ -38,10 +39,10 @@ class PairwiseDistance(Layer):
            Generally, no setting is required. Default: None.
    Shape:
-        x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
+        - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
          is the dimension of the data. Available data type is float32, float64.
-        y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
+        - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
-        output: The same dtype as input tensor.
+        - output: The same dtype as input tensor.
            - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
              depending on whether the input has data shaped as :math:`[N, D]`.
            - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,

--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -26,7 +26,8 @@ __all__ = []
 class BCEWithLogitsLoss(Layer):
    r"""
-    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer.
    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
    layer and some reduce operations.
@@ -81,21 +82,21 @@ class BCEWithLogitsLoss(Layer):
            For more information, please refer to :ref:`api_guide_Name`.
    Shapes:
-        logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+        - logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, `*`],
          N is batch_size, `*` means number of additional dimensions. The ``logit``
          is usually the output of Linear layer. Available dtype is float32, float64.
-        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+        - label (Tensor): The target labels tensor. 2-D tensor with the same shape as
          ``logit``. The target labels which values should be numbers between 0 and 1.
          Available dtype is float32, float64.
-        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+        - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``logit`` , else the shape of output is scalar.
    Returns:
        A callable object of BCEWithLogitsLoss.
    Examples:
        .. code-block:: python
            import paddle
            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
@@ -134,6 +135,7 @@ class BCEWithLogitsLoss(Layer):
 class CrossEntropyLoss(Layer):
    r"""
    By default, this operator implements the cross entropy loss function with softmax. This function
    combines the calculation of the softmax operation and the cross entropy loss function
    to provide a more numerically stable computing.
@@ -246,60 +248,35 @@ class CrossEntropyLoss(Layer):
    Parameters:
+        weight (Tensor, optional): a manual rescaling weight given to each class.
-        - **weight** (Tensor, optional)
-            a manual rescaling weight given to each class.
            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .
+        ignore_index (int64, optional): Specifies a target value that is ignored
-        - **ignore_index** (int64, optional)
-            Specifies a target value that is ignored
            and does not contribute to the loss. A negative value means that no label
            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .
+        reduction (str, optional): Indicate how to average the loss by batch_size,
-        - **reduction** (str, optional)
-            Indicate how to average the loss by batch_size,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
            Default is ``'mean'``.
+        soft_label (bool, optional): Indicate whether label is soft.
-        - **soft_label** (bool, optional)
-            Indicate whether label is soft.
            If soft_label=False, the label is hard.  If soft_label=True, the label is soft.
            Default is ``False``.
+        axis (int, optional): The index of dimension to perform softmax calculations.
-        - **axis** (int, optional)
-            The index of dimension to perform softmax calculations.
            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
            of dimensions of input :attr:`input`.
            Default is ``-1`` .
+        use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
-        - **use_softmax** (bool, optional)
-            Indicate whether compute softmax before cross_entropy.
            Default is ``True``.
+        name (str, optional): The name of the operator. Default is ``None`` .
-        - **name** (str, optional)
-            The name of the operator. Default is ``None`` .
            For more information, please refer to :ref:`api_guide_Name` .
    Shape:
+        - **input** (Tensor), the data type is float32, float64. Shape is
-        - **input** (Tensor)
-            Input tensor, the data type is float32, float64. Shape is
          :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` .
            Note:
                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
@@ -307,7 +284,6 @@ class CrossEntropyLoss(Layer):
                2. when use_softmax=False, it expects the output of softmax operator.
        - **label** (Tensor)
            1. If soft_label=False, the shape is
@@ -317,14 +293,9 @@ class CrossEntropyLoss(Layer):
            2. If soft_label=True, the shape and data type should be same with ``input`` ,
            and the sum of the labels for each sample should be 1.
-        - **output** (Tensor)
+        - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``.
-            Return the softmax cross_entropy loss of ``input`` and ``label``.
          The data type is the same as input.
          If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
          If :attr:`reduction` is ``'none'``:
            1. If soft_label = False, the dimension of return value is the same with ``label`` .
@@ -629,6 +600,7 @@ class MSELoss(Layer):
 class L1Loss(Layer):
    r"""
    Construct a callable object of the ``L1Loss`` class.
    The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
@@ -658,10 +630,10 @@ class L1Loss(Layer):
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
    Shape:
-        input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
+        - input (Tensor): The input tensor. The shapes is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
-        label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
+        - label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64.
-        output (Tensor): The L1 Loss of ``input`` and ``label``.
+        - output (Tensor): The L1 Loss of ``input`` and ``label``.
-            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+          If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
          If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
    Examples:
@@ -687,6 +659,7 @@ class L1Loss(Layer):
            print(output)
            # [[0.20000005 0.19999999]
            # [0.2        0.79999995]]
    """
    def __init__(self, reduction='mean', name=None):
@@ -707,6 +680,7 @@ class L1Loss(Layer):
 class BCELoss(Layer):
    """
    This interface is used to construct a callable object of the ``BCELoss`` class.
    The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
    and target labels ``label`` . The binary_cross_entropy loss can be described as:
@@ -750,13 +724,13 @@ class BCELoss(Layer):
            For more information, please refer to :ref:`api_guide_Name`.
    Shape:
-        input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means
+        - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means
          number of additional dimensions. The input ``input`` should always
          be the output of sigmoid.  Available dtype is float32, float64.
-        label (Tensor): 2-D tensor with the same shape as ``input``. The target
+        - label (Tensor): 2-D tensor with the same shape as ``input``. The target
          labels which values should be numbers between 0 and 1. Available
          dtype is float32, float64.
-        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
+        - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``input`` , else the shape of output is scalar.
    Returns:
@@ -909,6 +883,7 @@ class NLLLoss(Layer):
 class KLDivLoss(Layer):
    r"""
    Generate a callable object of 'KLDivLoss' to calculate the
    Kullback-Leibler divergence loss between Input(X) and
    Input(Target). Notes that Input(X) is the log-probability
@@ -928,14 +903,10 @@ class KLDivLoss(Layer):
             Default is ``'mean'``.
    Shape:
+        - input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions.
-        - input (Tensor): (N, *), where * means, any number of additional dimensions.
+        - label (Tensor): ``(N, *)``, same shape as input.
-        - label (Tensor): (N, *), same shape as input.
        - output (Tensor): tensor with shape: [1] by default.
    Examples:
        .. code-block:: python
@@ -965,6 +936,7 @@ class KLDivLoss(Layer):
            kldiv_criterion = nn.KLDivLoss(reduction='none')
            pred_loss = kldiv_criterion(x, target)
            # shape=[5, 20]
    """
    def __init__(self, reduction='mean'):
@@ -1817,6 +1789,7 @@ class MultiMarginLoss(Layer):
 class SoftMarginLoss(Layer):
    r"""
    Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
    and target labels ``label`` . It can be described as:
@@ -1835,16 +1808,13 @@ class SoftMarginLoss(Layer):
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
    Shapes:
+        - Input (Tensor): The input tensor with shape: ``[N, *]``,
-        Input (Tensor): The input tensor with shape: [N, *],
          N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf
          Available dtype is float32, float64.
+        - Label (Tensor): The target labels tensor with the same shape as
-        Label (Tensor): The target labels tensor with the same shape as
          ``input``. The target labels which values should be numbers -1 or 1.
          Available dtype is int32, int64, float32, float64.
+        - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
-        Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``input`` , else the shape of output is [1].
    Returns:
@@ -1877,6 +1847,7 @@ class SoftMarginLoss(Layer):
            #         [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511],
            #         [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399],
            #         [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]])
    """
    def __init__(self, reduction='mean', name=None):

--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -318,6 +318,7 @@ Where `H` means height of feature map, `W` means width of feature map.
 class GroupNorm(Layer):
    """
    This interface is used to construct a callable object of the ``GroupNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Group Normalization Layer.
@@ -338,7 +339,7 @@ class GroupNorm(Layer):
        name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
    Shape:
-        - x: Tensor with shape: (batch, num_features, *).
+        - x: Tensor with shape: ``(batch, num_features, *)``.
        - output: The same shape as input x.
    Returns:
@@ -1041,6 +1042,7 @@ class BatchNorm3D(_BatchNormBase):
 class SyncBatchNorm(_BatchNormBase):
    r"""
    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully connected
@@ -1086,9 +1088,9 @@ class SyncBatchNorm(_BatchNormBase):
    - :math:`\beta` : trainable shift parameter vector
    Note:
-        If you want to use container to pack your model and has ``SyncBatchNorm`` in the
+        If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the
-        evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of
+        evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of
-        ``list`` to pack the model.
+        ``list`` to pack the model.
    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
@@ -1106,8 +1108,8 @@ class SyncBatchNorm(_BatchNormBase):
             have trainable bias parameter. Default: None.
    Shapes:
-        input: Tensor that the dimension from 2 to 5.
+        - input: Tensor that the dimension from 2 to 5.
-        output: Tensor with the same shape as input.
+        - output: Tensor with the same shape as input.
    Examples:
        .. code-block:: python
@@ -1129,6 +1131,7 @@ class SyncBatchNorm(_BatchNormBase):
                #          [[ 0.80956620, -0.66528702],
                #           [-1.27446556,  1.13018656]]]])
    """
    def __init__(
@@ -1277,8 +1280,8 @@ class SyncBatchNorm(_BatchNormBase):
            The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead.
        Examples:
            .. code-block:: python
                import paddle
                import paddle.nn as nn

--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -223,6 +223,7 @@ class AvgPool2D(Layer):
 class AvgPool3D(Layer):
    """
    This operation applies 3D average pooling over input features based on the input,
    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
    in NCDHW format, where N is batch size, C is the number of channels,
@@ -263,6 +264,7 @@ class AvgPool3D(Layer):
          The data type can be float32, float64.
        - output(Tensor): The output tensor of avg pool3d  operator, which is a 5-D tensor.
          The data type is same as input x.
    Examples:
        .. code-block:: python

--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -613,14 +613,17 @@ class QuantizedConv2D(Layer):
 class QuantizedConv2DTranspose(Layer):
    """
    The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose.
    The only difference is that its inputs are all fake quantized.
    Examples:
       .. code-block:: python
          import paddle
          import paddle.nn as nn
          from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
          x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
          conv = nn.Conv2DTranspose(4, 6, (3, 3))
          conv_quantized = QuantizedConv2DTranspose(conv)
@@ -630,6 +633,7 @@ class QuantizedConv2DTranspose(Layer):
          y_np = y_var.numpy()
          print(y_np.shape, y_quantized_np.shape)
          # (2, 6, 10, 10), (2, 6, 10, 10)
    """
    def __init__(

--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -1647,6 +1647,7 @@ class MultiplicativeDecay(LRScheduler):
 class OneCycleLR(LRScheduler):
    r"""
    Sets the learning rate according to the one cycle learning rate scheduler.
    The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
    from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.
@@ -1660,22 +1661,25 @@ class OneCycleLR(LRScheduler):
    Also note that you should update learning rate each step.
    Args:
-        max_learning_rate (float): The maximum learning rate. It is a python float number.
+        max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
-             Functionally, it defines the initial learning rate by ``divide_factor`` .
        total_steps (int): Number of total training steps.
-        divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
+        divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
        end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate.
        phase_pct (float, optional): The percentage of total steps used to increase the learning rate. Default: 0.3.
-        anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing,
+        anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
-            'linear' for linear annealing. Default: 'cos'.
        three_phase (bool, optional): Whether to use three phase.
            If ``True``:
                1. The learning rate will first increase from initial learning rate to maximum learning rate.
                2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
                3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.
            If ``False``:
                1. The learning rate will increase to maximum learning rate.
                2. Then it will directly decrease to minimum learning rate.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
@@ -1727,6 +1731,7 @@ class OneCycleLR(LRScheduler):
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # You should update learning rate each step
    """
    def __init__(

--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1194,6 +1194,7 @@ def triu(x, diagonal=0, name=None):
 def meshgrid(*args, **kwargs):
    """
    Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids.
    Args:

--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
@@ -732,6 +732,7 @@ def preprocess(equation, *operands):
 def parse_fake_shape(equation, operands, labels):
    """
    this shape is just used for operands planning. may differ with the original shape.
    for example:
    ... is replaced by 1
@@ -739,6 +740,7 @@ def parse_fake_shape(equation, operands, labels):
    Results
    -------
    list of shape
    """
    shaped = collections.namedtuple('shaped', ['shape'])
@@ -862,6 +864,7 @@ def gen_einsum_op(equation, *operands):
 def einsum(equation, *operands):
    r"""
    einsum(equation, *operands)
    The current version of this API should be used in dygraph only mode.
@@ -897,8 +900,7 @@ def einsum(equation, *operands):
          dimensions into broadcasting dimensions.
        - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled
          dimensions will be reduced and removed in the output.
-        - Output labels can be explicitly specified on the right hand side of `->` or omitted.
+        - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
-        In the latter case, the output labels will be inferred from the input labels.
            - Inference of output labels
                - Broadcasting label `...`, if present, is put on the leftmost position.
                - Free labels are reordered alphabetically and put after `...`.
@@ -912,6 +914,7 @@ def einsum(equation, *operands):
                  a free label.
                - For any free label which is not present for the output, it's lowered to
                  a dummy label.
        - Examples
            - '...ij, ...jk', where i and k are free labels, j is dummy. The output label
              string is '...ik'
@@ -944,7 +947,7 @@ def einsum(equation, *operands):
            operands should equal the number of input terms in the equation.
    Returns:
-        result (`Tensor`): the result tensor.
+        result (`Tensor`), the result tensor.
    Examples:
        .. code-block:: python
@@ -1016,6 +1019,7 @@ def einsum(equation, *operands):
            #    [[0.32043904, 0.18164253, 0.27810261],
            #     [0.50226176, 0.24512935, 0.39881429],
            #     [0.51476848, 0.23367381, 0.39229113]]])
    """
    import os

--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -1905,12 +1905,15 @@ def mv(x, vec, name=None):
 def det(x, name=None):
    """
    Calculates determinant value of a square matrix or batches of square matrices.
    Args:
-        x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the
+        x (Tensor): the input matrix of size `(n, n)` or the
            batch of matrices of size `(*, n, n)` where `*` is one or more
            batch dimensions.
+        name(str, optional): Name of the output. Default is None. It's used
+            to print debug info for developers. Details: :ref:`api_guide_Name`
    Returns:
        Tensor, the determinant value of a square matrix or batches of square matrices.
@@ -1961,18 +1964,20 @@ def det(x, name=None):
 def slogdet(x, name=None):
    """
    Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant.
-    The determinant can be computed with ``sign * exp(logabsdet)
+    The determinant can be computed with ``sign * exp(logabsdet)``
    Supports input of float, double
    Note that for matrices that have zero determinant, this returns ``(0, -inf)``
    Args:
        x (Tensor): the batch of matrices of size :math:`(*, n, n)`
            where :math:`*` is one or more batch dimensions.
    Returns:
-        y (Tensor): A tensor containing the sign of the determinant and the natural logarithm
+        y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
        of the absolute value of determinant, respectively.
    Examples:
@@ -2090,6 +2095,7 @@ def svd(x, full_matrices=False, name=None):
 def matrix_power(x, n, name=None):
    r"""
    Computes the n-th power of a square matrix or a batch of square matrices.
    Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be
@@ -2115,7 +2121,7 @@ def matrix_power(x, n, name=None):
            For more information, please refer to :ref:`api_guide_Name`.
    Returns:
-        Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its
+        - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its
          data type should be the same as that of `x`.
    Examples:
@@ -3054,8 +3060,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 def solve(x, y, name=None):
    r"""
    Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'.
-    Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be
+    Let :math:`X` be a square matrix or a batch of square matrices, :math:`Y` be
    a vector/matrix or a batch of vectors/matrices, the equation should be:
    .. math::
@@ -3064,9 +3071,9 @@ def solve(x, y, name=None):
    Specifically, this system of linear equations has one solution if and only if input 'X' is invertible.
    Args:
-        x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or
+        x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or
            more batch dimensions. Its data type should be float32 or float64.
-        y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or
+        y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
            more batch dimensions. Its data type should be float32 or float64.
        name(str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.

--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -272,7 +272,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
 def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
-    """
+    r"""
    stanh activation.
    .. math::
@@ -283,8 +284,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
        x (Tensor): The input Tensor with data type float32, float64.
        scale_a (float, optional): The scale factor a of the input. Default is 0.67.
        scale_b (float, optional): The scale factor b of the output. Default is 1.7159.
-        name (str, optional): Name for the operation (optional, default is None).
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
-            For more information, please refer to :ref:`api_guide_Name`.
    Returns:
        A Tensor with the same data type and shape as ``x`` .

--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -1296,6 +1296,7 @@ def distribute_fpn_proposals(
    name=None,
 ):
    r"""
    In Feature Pyramid Networks (FPN) models, it is needed to distribute
    all proposals into different FPN level, with respect to scale of the proposals,
    the referring scale and the referring level. Besides, to restore the order of
@@ -1303,8 +1304,9 @@ def distribute_fpn_proposals(
    in current proposals. To compute FPN level for each roi, the formula is given as follows:
    .. math::
-        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
+        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
-        level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
+        level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
    where BBoxArea is a function to compute the area of each roi.
    Args:
@@ -1328,11 +1330,11 @@ def distribute_fpn_proposals(
            None by default.
    Returns:
-        multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
+        - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
          and data type is same as `fpn_rois` . The length is max_level-min_level+1.
-        restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
+        - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
          , where N is the number of total rois. The data type is int32.
-        rois_num_per_level (List): A list of 1-D Tensor and each Tensor is
+        - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
          the RoIs' number in each image on the corresponding level. The shape
          is [B] and data type of int32, where B is the number of images.
@@ -1351,6 +1353,7 @@ def distribute_fpn_proposals(
                refer_level=4,
                refer_scale=224,
                rois_num=rois_num)
    """
    num_lvl = max_level - min_level + 1
@@ -2438,6 +2441,7 @@ def matrix_nms(
    name=None,
 ):
    """
    This operator does matrix non maximum suppression (NMS).
    First selects a subset of candidate bounding boxes that have higher scores
    than score_threshold (if provided), then the top k candidate is selected if
@@ -2445,6 +2449,7 @@ def matrix_nms(
    decayed according to the Matrix NMS scheme.
    After NMS step, at most keep_top_k number of total bboxes are to be kept
    per image if keep_top_k is larger than -1.
    Args:
        bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
                           predicted locations of M bounding bboxes,
@@ -2468,29 +2473,32 @@ def matrix_nms(
                         on score_threshold.
        keep_top_k (int): Number of total bboxes to be kept per image after NMS
                          step. -1 means keeping all bboxes after NMS step.
-        use_gaussian (bool): Use Gaussian as the decay function. Default: False
+        use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
-        gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
+        gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
-        background_label (int): The index of background label, the background
+        background_label (int, optional): The index of background label, the background
                                label will be ignored. If set to -1, then all
                                categories will be considered. Default: 0
-        normalized (bool): Whether detections are normalized. Default: True
+        normalized (bool, optional): Whether detections are normalized. Default: True
-        return_index(bool): Whether return selected index. Default: False
+        return_index(bool, optional): Whether to return the selected index. Default: False
-        return_rois_num(bool): whether return rois_num. Default: True
+        return_rois_num(bool, optional): Whether to return rois_num. Default: True
-        name(str): Name of the matrix nms op. Default: None.
+        name(str, optional): Name of the matrix nms op. Default: None.
    Returns:
-        A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True,
+        - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
          otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
-        Out (Tensor): A 2-D Tensor with shape [No, 6] containing the
+        - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
          detection results.
-             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+          Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
-        Index (Tensor): A 2-D Tensor with shape [No, 1] containing the
+        - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
          selected indices, which are absolute values across batches.
-        rois_num (Tensor): A 1-D Tensor with shape [N] containing
+        - rois_num (Tensor), A 1-D Tensor with shape [N] containing
          the number of detected boxes in each image.
    Examples:
        .. code-block:: python
            import paddle
            from paddle.vision.ops import matrix_nms
            boxes = paddle.rand([4, 1, 4])
            boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
            boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
@@ -2498,6 +2506,7 @@ def matrix_nms(
            out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
                                 score_threshold=0.5, post_threshold=0.1,
                                 nms_top_k=400, keep_top_k=200, normalized=False)
    """
    check_variable_and_dtype(
        bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'