Unverified · Commit 2963e6a0 authored by Infinity_lee, committed by GitHub

[docs] fix some format issue (#45752)

* fix some error

* fix

* fix some error

* fix bugs

* fix some errors

* fix

* Update transform.py

fix some docs errors

* Update normal.py

fix some doc errors

* Update uniform.py

fix some docs errors

* Update kl.py

fix some docs errors

* Update math.py

fix some docs error

* Update math.py

fix heaviside links

* Update loss.py

fix

* Update transform.py

fix bugs

* Update math.py

fix

* fix some format issue

* Update normal.py

* fix missing np

* order imports

* fix some flake8 warning

* Update python/paddle/tensor/math.py

* fix OP-->API

* fix op

* fix grid_sample format

* trim trailing whitespace

* empty commit, test=document_fix

* empty commit
Co-authored-by: SigureMo <sigure.qaq@gmail.com>
Co-authored-by: Ligoml <39876205+Ligoml@users.noreply.github.com>
Parent 11002430
@@ -38,11 +38,11 @@ def kl_divergence(p, q):
         KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x
     Args:
-        p (Distribution): ``Distribution`` object.
-        q (Distribution): ``Distribution`` object.
+        p (Distribution): ``Distribution`` object. Inherits from the Distribution Base class.
+        q (Distribution): ``Distribution`` object. Inherits from the Distribution Base class.
     Returns:
-        Tensor: Batchwise KL-divergence between distribution p and q.
+        Tensor, Batchwise KL-divergence between distribution p and q.
     Examples:
@@ -71,8 +71,8 @@ def register_kl(cls_p, cls_q):
     implemention funciton by the decorator.
     Args:
-        cls_p(Distribution): Subclass derived from ``Distribution``.
-        cls_q(Distribution): Subclass derived from ``Distribution``.
+        cls_p (Distribution): The Distribution type of Instance p. Subclass derived from ``Distribution``.
+        cls_q (Distribution): The Distribution type of Instance q. Subclass derived from ``Distribution``.
     Examples:
         .. code-block:: python
...
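For context, a minimal usage sketch of the `kl_divergence` API whose docstring is fixed above (not part of the diff; values are illustrative):

.. code-block:: python

    import paddle

    p = paddle.distribution.Normal(loc=0.0, scale=1.0)
    q = paddle.distribution.Normal(loc=1.0, scale=2.0)
    # Dispatches to the pairwise rule registered via @register_kl(Normal, Normal).
    kl = paddle.distribution.kl_divergence(p, q)  # Tensor, batchwise KL(p || q)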
@@ -36,7 +36,7 @@ class Normal(distribution.Distribution):
     .. math::
-        pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2} {\sigma^2} }
+        pdf(x; \mu, \sigma) = \frac{1}{Z}e^{\frac {-0.5 (x - \mu)^2} {\sigma^2} }
     .. math::
@@ -49,8 +49,8 @@ class Normal(distribution.Distribution):
     * :math:`Z`: is the normalization constant.
     Args:
-        loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor.
-        scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor.
+        loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is float32 and float64.
+        scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is float32 and float64.
         name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
     Examples:
@@ -136,7 +136,7 @@ class Normal(distribution.Distribution):
         seed (int): Python integer number.
     Returns:
-        Tensor: A tensor with prepended dimensions shape.The data type is float32.
+        Tensor, A tensor with prepended dimensions shape.The data type is float32.
     """
     if not _non_static_mode():
@@ -177,14 +177,14 @@ class Normal(distribution.Distribution):
     .. math::
-        entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2)
+        entropy(\sigma) = 0.5 \log (2 \pi e \sigma^2)
     In the above equation:
     * :math:`scale = \sigma`: is the std.
     Returns:
-        Tensor: Shannon entropy of normal distribution.The data type is float32.
+        Tensor, Shannon entropy of normal distribution.The data type is float32.
     """
     name = self.name + '_entropy'
@@ -224,7 +224,7 @@ class Normal(distribution.Distribution):
         value (Tensor): The input tensor.
     Returns:
-        Tensor: probability.The data type is same with value.
+        Tensor, probability. The data type is same with value.
     """
     name = self.name + '_probs'
@@ -243,11 +243,11 @@ class Normal(distribution.Distribution):
     .. math::
-        KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio})
+        KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\frac{diff}{\sigma_1})^2 - 1 - 2 \ln {ratio})
     .. math::
-        ratio = \\frac{\sigma_0}{\sigma_1}
+        ratio = \frac{\sigma_0}{\sigma_1}
     .. math::
@@ -266,7 +266,7 @@ class Normal(distribution.Distribution):
         other (Normal): instance of Normal.
     Returns:
-        Tensor: kl-divergence between two normal distributions.The data type is float32.
+        Tensor, kl-divergence between two normal distributions.The data type is float32.
     """
     if not _non_static_mode():
...
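A minimal sketch of the `Normal` methods whose docstrings are touched above (not from the commit; shapes in the comments are illustrative):

.. code-block:: python

    import paddle

    a = paddle.distribution.Normal(loc=0.0, scale=1.0)
    b = paddle.distribution.Normal(loc=1.0, scale=2.0)

    sample = a.sample([3])        # float32 tensor of shape [3]
    entropy = a.entropy()         # 0.5 * log(2 * pi * e * sigma^2)
    kl = a.kl_divergence(b)       # closed-form KL between the two normals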
@@ -58,7 +58,7 @@ class Transform(object):
     Suppose :math:`X` is a K-dimensional random variable with probability
     density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may
     be defined by transforming :math:`X` with a suitably well-behaved funciton
-    :math:`f`. It suffices for what follows to note that if f is one-to-one and
+    :math:`f`. It suffices for what follows to note that if `f` is one-to-one and
     its inverse :math:`f^{-1}` have a well-defined Jacobian, then the density of
     :math:`Y` is
@@ -1001,8 +1001,9 @@ class StackTransform(Transform):
     specific axis.
     Args:
-        transforms(Sequence[Transform]): The sequence of transformations.
-        axis(int): The axis along which will be transformed.
+        transforms (Sequence[Transform]): The sequence of transformations.
+        axis (int, optional): The axis along which will be transformed. default
+            value is 0.
     Examples:
@@ -1010,7 +1011,6 @@ class StackTransform(Transform):
            import paddle
            x = paddle.stack(
                (paddle.to_tensor([1., 2., 3.]), paddle.to_tensor([1, 2., 3.])), 1)
-
            t = paddle.distribution.StackTransform(
@@ -1023,11 +1023,13 @@ class StackTransform(Transform):
            # [[2.71828175 , 1.         ],
            #  [7.38905621 , 4.         ],
            #  [20.08553696, 9.         ]])
+
            print(t.inverse(t.forward(x)))
            # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
            # [[1., 1.],
            #  [2., 2.],
            #  [3., 3.]])
+
            print(t.forward_log_det_jacobian(x))
            # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
            # [[1.        , 0.69314718],
...
@@ -37,7 +37,7 @@ class Uniform(distribution.Distribution):
     .. math::
-        pdf(x; a, b) = \\frac{1}{Z}, \ a <=x <b
+        pdf(x; a, b) = \frac{1}{Z}, \ a <=x <b
     .. math::
@@ -50,12 +50,14 @@ class Uniform(distribution.Distribution):
     * :math:`Z`: is the normalizing constant.
     The parameters `low` and `high` must be shaped in a way that supports
-    [broadcasting](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/beginners_guide/basic_concept/broadcasting_en.html) (e.g., `high - low` is a valid operation).
+    :ref:`user_guide_broadcasting` (e.g., `high - low` is a valid operation).
     Args:
-        low(int|float|list|tuple|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float, list, numpy.ndarray or Tensor
-        high(int|float|list|tuple|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float, list, numpy.ndarray or Tensor
-        name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+        low(int|float|list|tuple|numpy.ndarray|Tensor): The lower boundary of
+            uniform distribution.The data type is float32 and float64.
+        high(int|float|list|tuple|numpy.ndarray|Tensor): The higher boundary
+            of uniform distribution.The data type is float32 and float64.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Examples:
         .. code-block:: python
@@ -136,7 +138,7 @@ class Uniform(distribution.Distribution):
         seed (int): Python integer number.
     Returns:
-        Tensor: A tensor with prepended dimensions shape.The data type is float32.
+        Tensor, A tensor with prepended dimensions shape. The data type is float32.
     """
     if not _non_static_mode():
@@ -182,7 +184,7 @@ class Uniform(distribution.Distribution):
         value (Tensor): The input tensor.
     Returns:
-        Tensor: log probability.The data type is same with value.
+        Tensor, log probability.The data type is same with value.
     """
     value = self._check_values_dtype_in_probs(self.low, value)
@@ -219,7 +221,7 @@ class Uniform(distribution.Distribution):
         value (Tensor): The input tensor.
     Returns:
-        Tensor: probability.The data type is same with value.
+        Tensor, probability. The data type is same with value.
     """
     value = self._check_values_dtype_in_probs(self.low, value)
@@ -256,7 +258,7 @@ class Uniform(distribution.Distribution):
         entropy(low, high) = \\log (high - low)
     Returns:
-        Tensor: Shannon entropy of uniform distribution.The data type is float32.
+        Tensor, Shannon entropy of uniform distribution.The data type is float32.
     """
     name = self.name + '_entropy'
...
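A minimal sketch of the `Uniform` API documented above (not part of the diff; values are illustrative):

.. code-block:: python

    import paddle

    u = paddle.distribution.Uniform(low=0.0, high=2.0)   # `high - low` must broadcast
    sample = u.sample([4])
    lp = u.log_prob(paddle.to_tensor([0.5, 1.5]))        # log(1 / (high - low)) = -0.6931...
    ent = u.entropy()                                    # log(high - low) = 0.6931...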
@@ -44,8 +44,7 @@ def celu(x, alpha=1.0, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         alpha (float, optional): The 'alpha' value of the CELU formulation. Default is 1.0.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -95,8 +94,7 @@ def elu(x, alpha=1.0, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -145,6 +143,8 @@ def gelu(x, approximate=False, name=None):
     r"""
     gelu activation.
+
+    The activation function of Gelu is calculated element by element. More information refers to :ref: `Gaussian Error Linear Units`.
     if approximate is True
     .. math::
@@ -159,9 +159,8 @@ def gelu(x, approximate=False, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        approximate (bool, optional): Wether to enable approximation. Default is False.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        approximate (bool, optional): Whether to enable approximation. Default is False.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -214,9 +213,8 @@ def hardshrink(x, threshold=0.5, name=None):
     Args:
         x (Tensor): The input Tensor with data type float32, float64.
-        threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        threshold (float, optional): The value of threshold for hardthrink. Default is 0.5.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -267,8 +265,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None):
         x (Tensor): The input Tensor with data type float32, float64.
         min (float, optional): The minimum value of the linear region range. Default is -1.
         max (float, optional): The maximum value of the linear region range. Default is 1.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -327,8 +324,7 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None):
         x (Tensor): The input Tensor with data type float32, float64.
         slope (float, optional): The slope of hardsigmoid function. Default is 0.1666667.
         offset (float, optional): The offset of hardsigmoid function. Default is 0.5.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -385,8 +381,7 @@ def hardswish(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -432,8 +427,7 @@ def leaky_relu(x, negative_slope=0.01, name=None):
         x (Tensor): The input Tensor with data type float32, float64.
         negative_slope (float, optional): Slope of the activation function at
            :math:`x < 0` . Default is 0.01.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -479,8 +473,7 @@ def prelu(x, weight, data_format="NCHW", name=None):
         x (Tensor): The input Tensor with data type float32, float64.
         weight (Tensor): The learnable parameter with data type same as ``x``.
            The weight shape is [1] or [in], where `in` is the input channel of ``x``.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
         data_format(str, optional): Data format that specifies the layout of input.
            It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW".
@@ -607,8 +600,7 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None):
         lower (float, optional): The lower bound of uniform distribution. Default: 0.125.
         upper (float, optional): The upper bound of uniform distribution. Default: 0.333.
         training (bool, optional): Current mode is in training or others. Default is True.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -691,8 +683,7 @@ def relu(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -742,8 +733,7 @@ def log_sigmoid(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -803,8 +793,7 @@ def maxout(x, groups, axis=1, name=None):
            is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` ,
            where D is the dimensions of ``x`` . ``axis`` only supports 1, 3 or -1.
            Default is 1.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type as ``x`` .
@@ -861,8 +850,7 @@ def relu6(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -915,8 +903,7 @@ def selu(x,
         x (Tensor): The input Tensor with data type float32, float64.
         scale (float, optional): The value of scale(must be greater than 1.0) for selu. Default is 1.0507009873554804934193349852946
         alpha (float, optional): The value of alpha(must be no less than zero) for selu. Default is 1.6732632423543772848170429916717
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -968,8 +955,7 @@ def silu(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1079,8 +1065,7 @@ def softmax(x, axis=-1, dtype=None, name=None):
            dimensions of ``x`` . If ``axis`` < 0, it works the same way as
            :math:`axis + D` . Default is -1.
         dtype (str, optional): The data type of the output tensor, can be float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same shape and data type (use ``dtype`` if it is
@@ -1194,8 +1179,7 @@ def softplus(x, beta=1, threshold=20, name=None):
         x (Tensor): The input Tensor with data type float32, float64.
         beta (float, optional): The value of beta for softplus. Default is 1
         threshold (float, optional): The value of threshold for softplus. Default is 20
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1249,8 +1233,7 @@ def softshrink(x, threshold=0.5, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1296,8 +1279,7 @@ def softsign(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1335,8 +1317,7 @@ def swish(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1381,8 +1362,7 @@ def mish(x, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1418,8 +1398,7 @@ def tanhshrink(x, name=None):
     Args:
         x (Tensor): The input Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1466,8 +1445,7 @@ def thresholded_relu(x, threshold=1.0, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         threshold (float, optional): The value of threshold for thresholded_relu. Default is 1.0
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type and shape as ``x`` .
@@ -1524,8 +1502,7 @@ def log_softmax(x, axis=-1, dtype=None, name=None):
            preventing data type overflows. Supported dtype: float32, float64.
            If ``dtype`` is None, the output Tensor has the same dtype as x.
            Default is None.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same shape and data type (use ``dtype`` if it is
@@ -1615,8 +1592,7 @@ def glu(x, axis=-1, name=None):
            should be in range [-D, D), where D is the dimensions of ``x`` .
            If ``axis`` < 0, it works the same way as :math:`axis + D` .
            Default is -1.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         A Tensor with the same data type as x. The size of the given aixs is
@@ -1678,8 +1654,7 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
            in autograd. Default is False.
         axis (int, optional): The axis along will be calculated softmax value.
            Default is -1.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     Returns:
         Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution.
...
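The recurring change above only rewords the ``name`` docstring entry; none of these functions changes behavior. A minimal sketch (not part of the commit) exercising a few of the touched activations:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([-1.0, 0.0, 1.0])
    y = F.gelu(x, approximate=True)      # tanh approximation of GELU
    z = F.hardshrink(x, threshold=0.5)   # zeros out values in [-0.5, 0.5]
    # `name` only labels the operator for debugging/profiling; it never changes the result.
    w = F.relu(x, name='my_relu')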
@@ -176,6 +176,7 @@ def interpolate(x,
     """
     This API resizes a batch of images.
+
     The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
     or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape
     (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
@@ -184,12 +185,13 @@ def interpolate(x,
     and the resizing only applies on the three dimensions(depth, height and width).
     Supporting resample methods:
-        'linear' : Linear interpolation
-        'bilinear' : Bilinear interpolation
-        'trilinear' : Trilinear interpolation
-        'nearest' : Nearest neighbor interpolation
-        'bicubic' : Bicubic interpolation
-        'area': Area interpolation
+        - 'linear' : Linear interpolation
+        - 'bilinear' : Bilinear interpolation
+        - 'trilinear' : Trilinear interpolation
+        - 'nearest' : Nearest neighbor interpolation
+        - 'bicubic' : Bicubic interpolation
+        - 'area': Area interpolation
     Linear interpolation is the method of using a line connecting two known quantities
     to determine the value of an unknown quantity between the two known quantities.
@@ -226,13 +228,13 @@ def interpolate(x,
     .. code-block:: text
-        For scale_factor:
+        # For scale_factor:
            if align_corners = True && out_size > 1 :
                scale_factor = (in_size-1.0)/(out_size-1.0)
            else:
                scale_factor = float(in_size/out_size)
-        Linear interpolation:
+        # Linear interpolation:
            if:
                align_corners = False , align_mode = 0
                input : (N,C,W_in)
@@ -243,7 +245,7 @@ def interpolate(x,
            output: (N,C,W_out) where:
                W_out = W_{in} * scale_{factor}
-        Nearest neighbor interpolation:
+        # Nearest neighbor interpolation:
            align_corners = False
            input : (N,C,H_in,W_in)
@@ -251,7 +253,7 @@ def interpolate(x,
                H_out = floor (H_{in} * scale_{factor})
                W_out = floor (W_{in} * scale_{factor})
-        Bilinear interpolation:
+        # Bilinear interpolation:
            if:
                align_corners = False , align_mode = 0
                input : (N,C,H_in,W_in)
@@ -264,7 +266,7 @@ def interpolate(x,
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}
-        Bicubic interpolation:
+        # Bicubic interpolation:
            if:
                align_corners = False
                input : (N,C,H_in,W_in)
@@ -277,7 +279,7 @@ def interpolate(x,
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}
-        Trilinear interpolation:
+        # Trilinear interpolation:
            if:
                align_corners = False , align_mode = 0
                input : (N,C,D_in,H_in,W_in)
@@ -916,6 +918,7 @@ def dropout(x,
            - train: out = input * mask
            - inference: out = input * (1.0 - dropout_prob)
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
     Returns:
@@ -1776,12 +1779,12 @@ def linear(x, weight, bias=None, name=None):
 def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
     r"""
     Label smoothing is a mechanism to regularize the classifier layer and is called
-    label-smoothing regularization (LSR).
-    Label smoothing is proposed to encourage the model to be less confident,
-    since optimizing the log-likelihood of the correct label directly may
-    cause overfitting and reduce the ability of the model to adapt. Label
-    smoothing replaces the ground-truth label :math:`y` with the weighted sum
+    label-smoothing regularization (LSR).Label smoothing is proposed to encourage
+    the model to be less confident, since optimizing the log-likelihood of the
+    correct label directly may cause overfitting and reduce the ability of the
+    model to adapt.
+    Label smoothing replaces the ground-truth label :math:`y` with the weighted sum
     of itself and some fixed distribution :math:`\mu`. For class :math:`k`,
     i.e.
...
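A minimal sketch (not part of the diff) of the two APIs whose docstrings are reflowed above; shapes and values in the comments are illustrative:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # interpolate: 4-D input (num_batches, channels, in_h, in_w), bilinear resample.
    x = paddle.rand([1, 3, 8, 8])
    up = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False)
    # up.shape == [1, 3, 16, 16]

    # label_smooth: (1 - epsilon) * y + epsilon * uniform prior when prior_dist is None.
    one_hot = paddle.to_tensor([[0.0, 1.0, 0.0]])
    smoothed = F.label_smooth(one_hot, epsilon=0.1)  # [[0.0333, 0.9333, 0.0333]]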
@@ -854,15 +854,18 @@ def hsigmoid_loss(input,
     """
     The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
     and speed up the model training, especially the training of language model.
+
     Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
     For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on
     the path, and sum them to get a total cost.
-    Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
+
+    Comparing to softmax, hsigmoid can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
     represents the number of classes or the size of word dict.
-    The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
-    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_. For the custom
-    tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
+    The API supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
+    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_.
+
+    For the custom tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
     1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict.
     2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table.
@@ -1732,9 +1735,7 @@ def margin_cross_entropy(logits,
     .. hint::
         The API supports single GPU and multi GPU, and don't supports CPU.
-
         For data parallel mode, set ``group=False``.
-
         For model parallel mode, set ``group=None`` or the group instance return by paddle.distributed.new_group.
         And logits.shape[-1] can be different at each rank.
@@ -1757,12 +1758,12 @@ def margin_cross_entropy(logits,
            Default value is `'mean'`.
     Returns:
-        ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \
-            `return_softmax` is False, otherwise the tuple \
-            (loss, softmax), softmax is shard_softmax when \
-            using model parallel, otherwise softmax is in \
-            the same shape with input logits. If ``reduction == None``, \
-            the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``.
+        Tensor|tuple[Tensor, Tensor], return the cross entropy loss if
+            `return_softmax` is False, otherwise the tuple (loss, softmax),
+            softmax is shard_softmax when using model parallel, otherwise
+            softmax is in the same shape with input logits. If
+            ``reduction == None``, the shape of loss is ``[N, 1]``, otherwise
+            the shape is ``[1]``.
     Examples:
...
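A minimal sketch of `hsigmoid_loss` under the default tree (not taken from the commit; the random inputs and seed are placeholders, and the weight shape follows the one-classifier-per-non-leaf-node rule described above):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    paddle.seed(2022)
    feature_dim, num_classes = 3, 5
    x = paddle.uniform([4, feature_dim])
    label = paddle.to_tensor([0, 1, 4, 2], dtype='int64')
    # One binary classifier per non-leaf node of the default complete binary tree.
    weight = paddle.uniform([num_classes - 1, feature_dim])
    loss = F.hsigmoid_loss(x, label, num_classes, weight)  # summed cost along each root-to-leaf path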
@@ -31,7 +31,7 @@ __all__ = []
 def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
     r"""
-    This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes
+    Normalize ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes
     .. math::
@@ -45,7 +45,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
     Parameters:
         x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64.
-        p (float|int, optional): The exponent value in the norm formulation. Default: 2
+        p (float|int, optional): The exponent value in the norm formulation. Default: 2.
         axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension.
         epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
...
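A minimal sketch of `normalize` (not part of the diff; output values are illustrative):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([[3.0, 4.0], [0.0, 0.0]])
    y = F.normalize(x, p=2, axis=1)
    # [[0.6, 0.8], [0., 0.]]; the epsilon denominator guards the all-zero row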
@@ -124,7 +124,7 @@ def grid_sample(x,
                 align_corners=True,
                 name=None):
     """
-    This operation samples input X by using bilinear interpolation or
+    Sample input X by using bilinear interpolation or
     nearest interpolation based on flow field grid, which is usually
     generated by :code:`affine_grid` . When the input X is 4-D Tensor,
     the grid of shape [N, H, W, 2] is the concatenation of (x, y)
@@ -209,6 +209,7 @@ def grid_sample(x,
            None by default.
+
     Returns:
         Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid,
            `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
...
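A minimal sketch of `grid_sample` on a 4-D input (not from the commit; shapes are illustrative):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 1, 3, 3])               # NCHW feature map
    grid = paddle.rand([1, 3, 3, 2]) * 2 - 1    # (x, y) sampling locations in [-1, 1]
    out = F.grid_sample(x, grid, mode='bilinear', padding_mode='zeros', align_corners=True)
    # out.shape == [1, 1, 3, 3]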
@@ -132,12 +132,12 @@ def make_scheduler(*,
         skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0.
     Returns:
-        A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState.
+        A scheduler function, conforms to above state transform setting. The function will takes one parameter `step_num`, and returns corresponding ProfilerState.
     Examples:
-        1. profiling range [2, 5]
-        Assume batch 0: closed, batch 1: ready, batch [2, 5] record
+        1. profiling range [2, 5].
+        Assume batch 0: closed, batch 1: ready, batch [2, 5] record.
         .. code-block:: python
            :name: code-example1
@@ -146,9 +146,9 @@ def make_scheduler(*,
            profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)
-        2. profiling range [3,6], [9,12], [15,18]...
-        Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
+        2. profiling range [3,6], [9,12], [15,18].
+        Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat.
         .. code-block:: python
            :name: code-example2
@@ -196,12 +196,12 @@ def export_chrome_tracing(dir_name: str,
                           worker_name: Optional[str] = None) -> Callable:
     r"""
     Return a callable, used for outputing tracing data to chrome tracing format file.
-    The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
-    if worker_name is not set, the default name is [hostname]_[pid].
+    The output file will be saved in directory ``dir_name``, and file name will be set as `worker_name`.
+    if `worker_name` is not set, the default name is `[hostname]_[pid]`.
     Args:
         dir_name(str): Directory to save profiling data.
-        worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
+        worker_name(str, optional): Prefix of the file name saved, default is `[hostname]_[pid]`.
     Returns:
         A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file.
@@ -246,12 +246,12 @@ def export_protobuf(dir_name: str,
                     worker_name: Optional[str] = None) -> Callable:
     r"""
     Return a callable, used for outputing tracing data to protobuf file.
-    The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
-    if worker_name is not set, the default name is [hostname]_[pid].
+    The output file will be saved in directory ``dir_name``, and file name will be set as ``worker_name``.
+    if ``worker_name`` is not set, the default name is `[hostname]_[pid]`.
     Args:
         dir_name(str): Directory to save profiling data.
-        worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
+        worker_name(str, optional): Prefix of the file name saved, default is `[hostname]_[pid]`.
     Returns:
         A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file.
@@ -317,7 +317,7 @@ class Profiler:
            If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
            which means profiling range [start_batch, end_batch).
         on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing.
-           This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing <api_paddle_profiler_export_chrome_tracing>` (./profiler_log/).
+           This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing <api_paddle_profiler_export_chrome_tracing>`.
         timer_only (bool, optional): If it is True, the cost of Dataloader and every step of the model will be count without profiling. Otherwise, the model will
            be timed and profiled. Default: False.
         record_shapes (bool, optional): If it is True, collect op's input shape information. Default: False.
@@ -339,7 +339,7 @@ class Profiler:
                    #train()
                    p.step()
-        2. profiling range [2,4], [7, 9], [11,13]
+        2. profiling range [2,4], [7, 9], [11,13].
         .. code-block:: python
            :name: code-example2
@@ -354,7 +354,7 @@ class Profiler:
                    #train()
                    p.step()
-        3. Use profiler without context manager, and use default parameters
+        3. Use profiler without context manager, and use default parameters.
         .. code-block:: python
            :name: code-example3
@@ -369,7 +369,7 @@ class Profiler:
            p.stop()
            p.summary()
-        4. Use profiler to get throughput and cost of the model
+        4. Use profiler to get throughput and cost of the model.
         .. code-block:: python
            :name: code-example-timer1
@@ -399,8 +399,7 @@ class Profiler:
                dataset = RandomDataset(20 * 4)
                simple_net = SimpleNet()
-               opt = paddle.optimizer.SGD(learning_rate=1e-3,
-                                          parameters=simple_net.parameters())
+               opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=simple_net.parameters())
                BATCH_SIZE = 4
                loader = paddle.io.DataLoader(
                    dataset,
@@ -531,7 +530,7 @@ class Profiler:
                prof.stop()
        '''
-        # Timing only without profiling
+        # Timing only without profiling.
        benchmark().begin()
        if not self.timer_only or self.emit_nvtx:
            utils._is_profiler_used = True
@@ -584,7 +583,7 @@ class Profiler:
        if self.profile_memory:
            disable_memory_recorder()
        # self.current_state -> CLOSED
-        # In this situation, RECORD state is regarded as RECORD_AND_RETURN
+        # In this situation, RECORD state is regarded as RECORD_AND_RETURN.
        if self.record_event:
            self.record_event.end()
            self.record_event = None
@@ -607,7 +606,7 @@ class Profiler:
        Args:
            num_samples (int|None, optional): Specifies the batch size of every step of the model
-                that is used to compute throughput when timer_only is True. Default: None.
+                that is used to compute throughput when `timer_only` is True. Default: None.
        Examples:
            .. code-block:: python
@@ -645,7 +644,7 @@ class Profiler:
        r"""
        Get statistics for current step. If the function is called at certain iteration
        intervals, the result is the average of all steps between the previous call and
-        this call. Statistics are as follows
+        this call. Statistics are as follows:
        1. reader_cost: the cost of loading data measured in seconds.
@@ -751,7 +750,7 @@ class Profiler:
        Args:
            path(str): file path of the output.
-            format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json".
+            format(str, optional): output format, can be chosen from ['json', 'pb'], 'json' for chrome tracing and 'pb' for protobuf, default value is 'json'.
        Examples:
...
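Pulling the pieces above together, a minimal profiling loop (a sketch, not taken from the diff; the training step is stubbed out):

.. code-block:: python

    import paddle.profiler as profiler

    # closed: batch 0, ready: batch 1, record: batches [2, 5], then stop (repeat=1).
    sched = profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)

    with profiler.Profiler(
            targets=[profiler.ProfilerTarget.CPU],
            scheduler=sched,
            on_trace_ready=profiler.export_chrome_tracing('./profiler_log')) as p:
        for step in range(10):
            # train_one_batch(...)  # placeholder for real work
            p.step()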
@@ -36,8 +36,10 @@ class RecordEvent(ContextDecorator):
     Interface for recording a time range by user defined.
     Args:
-        name(str): Name of the record event
-        event_type(TracerEventType, optional): Optional, default value is TracerEventType.PythonUserDefined. It is reserved for internal purpose, and it is better not to specify this parameter.
+        name (str): Name of the record event.
+        event_type (TracerEventType, optional): Optional, default value is
+            `TracerEventType.PythonUserDefined`. It is reserved for internal
+            purpose, and it is better not to specify this parameter.
     Examples:
         .. code-block:: python
@@ -59,7 +61,7 @@ class RecordEvent(ContextDecorator):
            record_event.end()
     **Note**:
-        RecordEvent will take effect only when :ref:`Profiler <api_paddle_profiler_Profiler>` is on and at the state of RECORD.
+        RecordEvent will take effect only when :ref:`Profiler <api_paddle_profiler_Profiler>` is on and at the state of `RECORD`.
     """
     def __init__(
@@ -134,7 +136,7 @@ def load_profiler_result(filename: str):
         filename(str): Name of the exported protobuf file of profiler data.
     Returns:
-        ProfilerResult object, which stores profiling data.
+        ``ProfilerResult`` object, which stores profiling data.
     Examples:
         .. code-block:: python
...
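A minimal sketch of `RecordEvent` as a context manager (not from the commit; the workload inside the block is a placeholder):

.. code-block:: python

    import paddle
    import paddle.profiler as profiler

    with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU]) as p:
        # The event shows up in the trace only while the profiler is in RECORD state.
        with profiler.RecordEvent(name='matmul_block'):
            x = paddle.randn([16, 16])
            y = paddle.matmul(x, x)
        p.step()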
@@ -4119,9 +4119,8 @@ def lerp_(x, y, weight, name=None):
 def erfinv(x, name=None):
     r"""
-    The inverse error function of x.
-    Equation:
+    The inverse error function of x. Please refer to :ref:`api_paddle_erf`
     .. math::
         erfinv(erf(x)) = x.
@@ -4234,7 +4233,6 @@ def deg2rad(x, name=None):
     r"""
     Convert each of the elements of input x from degrees to angles in radians.
-    Equation:
     .. math::
         deg2rad(x)=\pi * x / 180
@@ -4250,7 +4248,6 @@ def deg2rad(x, name=None):
        .. code-block:: python
            import paddle
-            import numpy as np
            x1 = paddle.to_tensor([180.0, -180.0, 360.0, -360.0, 90.0, -90.0])
            result1 = paddle.deg2rad(x1)
@@ -4676,18 +4673,18 @@ def angle(x, name=None):
     return out
 def heaviside(x, y, name=None):
-    """
+    r"""
     Computes the Heaviside step function determined by corresponding element in y for each element in x. The equation is
     .. math::
         heaviside(x, y)=
            \left\{
-                \\begin{array}{lcl}
-                0,& &\\text{if} \ x < 0, \\\\
-                y,& &\\text{if} \ x = 0, \\\\
-                1,& &\\text{if} \ x > 0.
+                \begin{array}{lcl}
+                0,& &\text{if} \ x < 0, \\
+                y,& &\text{if} \ x = 0, \\
+                1,& &\text{if} \ x > 0.
                \end{array}
-            \\right.
+            \right.
     Note:
         ``paddle.heaviside`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
...
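A minimal sketch of the three math APIs touched above (not part of the diff; printed values are approximate):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([-0.5, 0.0, 0.5])
    y = paddle.to_tensor([1.0])                        # broadcast against x
    paddle.heaviside(x, y)                             # [0., 1., 1.]
    paddle.deg2rad(paddle.to_tensor([180.0]))          # [~3.1416]
    paddle.erfinv(paddle.to_tensor([0.0, 0.5]))        # [0., ~0.4769]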
@@ -293,7 +293,7 @@ def CUDAExtension(sources, *args, **kwargs):
         **kwargs(dict[option], optional): Specify other arguments same as ``setuptools.Extension`` .
     Returns:
-        setuptools.Extension: An instance of setuptools.Extension
+        setuptools.Extension: An instance of setuptools.Extension.
     """
     kwargs = normalize_extension_kwargs(kwargs, use_cuda=True)
     # Note(Aurelius84): While using `setup` and `jit`, the Extension `name` will
...
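For reference, how `CUDAExtension` is typically wired into a `setup.py`; a sketch with placeholder source file names:

.. code-block:: python

    # setup.py (source file names below are placeholders)
    from paddle.utils.cpp_extension import CUDAExtension, setup

    setup(
        name='custom_relu_jit_lib',
        ext_modules=CUDAExtension(
            sources=['custom_relu_op.cc', 'custom_relu_op.cu']
        )
    )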