diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py
index bfa7bc8e66e22ef9843e555dc2f800c50e159f76..b26a585d5b4a13384059161a40cb13d8901cbef5 100644
--- a/python/paddle/amp/auto_cast.py
+++ b/python/paddle/amp/auto_cast.py
@@ -94,9 +94,9 @@ def decorate(
     Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode.

     Args:
-        models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None.
+        models(Layer|list of Layer): The models defined by the user; they must be either a single model or a list of models. Default is None.
         optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
-        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing;
+        level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': O1 represents mixed precision, in which the decorator does nothing;
             O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. Default is O1(amp)
         dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
         master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None.
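As context for the `decorate` docstring changed above, a minimal usage sketch of the O2 flow (a sketch only, assuming a float16-capable GPU; the toy layer and optimizer are illustrative, not part of this patch):

.. code-block:: python

    import paddle

    # Toy model/optimizer pair; any Layer and Optimizer are wired up the same way.
    model = paddle.nn.Linear(4, 4)
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

    # level='O2' casts the parameters to float16 (except BatchNorm/LayerNorm);
    # level='O1' would leave parameters untouched and only autocast operators.
    model, opt = paddle.amp.decorate(models=model, optimizers=opt, level='O2')

    with paddle.amp.auto_cast(level='O2'):
        loss = model(paddle.randn([2, 4])).mean()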
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index f9ac515aeb5d762c7004a9dc455361e53956c460..4f56306e5e38e0389bbe529b944326480df461e4 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -932,7 +932,8 @@ def xpu_places(device_ids=None):

 def npu_places(device_ids=None):
     """
-    **Note**:
+
+    Note:
         For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device.

     This function creates a list of :code:`paddle.NPUPlace` objects.
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 2bf2c4542b6808fcb73293dcdd93cf008ec4b45f..79f2cde1b1b6c2acf68095b05c1e4f53f1025b77 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -4174,7 +4174,6 @@ class ModelAverage(Optimizer):

 class ExponentialMovingAverage:
     r"""
-    :api_attr: Static Graph

     Compute the moving average of parameters with exponential decay.
     Given a parameter :math:`\\theta`, its exponential moving average (EMA)
@@ -4182,9 +4181,9 @@ class ExponentialMovingAverage:

     .. math::

-        \\text{EMA}_0 & = 0
+        \text{EMA}_0 & = 0

-        \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t
+        \text{EMA}_t & = \text{decay} * \text{EMA}_{t-1} + (1 - \text{decay}) * \theta_t

     The average results calculated by **update()** method will be saved in
     temporary variables which are created and maintained by the object, and can
@@ -4193,12 +4192,12 @@ class ExponentialMovingAverage:

     **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be
     zero biased, which can be corrected by divided by a factor
-    :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters
+    :math:`(1 - \text{decay}^t)` , i.e., the actual EMAs applied to parameters
     when calling **apply()** method would be

     .. math::

-        \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t}
+        \widehat{\text{EMA}}_t = \frac{\text{EMA}_t}{1 - \text{decay}^t}

     **Decay rate scheduling**. A large decay rate very close to 1 would result
     in that the averages move very slowly. And a better strategy is to set a
@@ -4208,7 +4207,7 @@ class ExponentialMovingAverage:

     .. math::

-        \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}})
+        \min(\text{decay}, \frac{1 + \text{thres_steps}}{10 + \text{thres_steps}})

     Usually **thres_steps** can be the global training steps.
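The bias-corrected recurrence in the `ExponentialMovingAverage` hunks above is easy to sanity-check numerically; a plain-Python sketch, independent of the Paddle API, tracking a constant parameter:

.. code-block:: python

    # EMA_t = decay * EMA_{t-1} + (1 - decay) * theta_t, started from EMA_0 = 0.
    # Dividing by (1 - decay**t) removes the zero-initialization bias.
    decay = 0.9
    ema = 0.0
    for t in range(1, 4):                    # constant parameter stream theta_t = 1.0
        ema = decay * ema + (1 - decay) * 1.0
        corrected = ema / (1 - decay ** t)
        print(t, round(ema, 4), corrected)   # corrected recovers 1.0 (up to rounding)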
diff --git a/python/paddle/incubate/operators/softmax_mask_fuse.py b/python/paddle/incubate/operators/softmax_mask_fuse.py
index 672f4ad545e43fad0d4b56ca863d80ee8b851615..399f8e9bd9800da2e3aaa98e6958f3196e03a761 100644
--- a/python/paddle/incubate/operators/softmax_mask_fuse.py
+++ b/python/paddle/incubate/operators/softmax_mask_fuse.py
@@ -28,7 +28,7 @@ def softmax_mask_fuse(x, mask, name=None):
     .. math::
         out = softmax(x + mask)

-    **Note**:
+    Note:
         This API only supports GPU.

     Args:
diff --git a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
index 936b1971513a04c86a0098a1ec67e21ad3bac31f..ffe8d8ac5ada9bbf31cdc623656a6fcd6e728419 100644
--- a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
+++ b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
@@ -29,7 +29,7 @@ def softmax_mask_fuse_upper_triangle(x):
     .. math::
         out = softmax(LowerTriangular(x))

-    **Note**:
+    Note:
         This API only supports GPU.

     Args:
diff --git a/python/paddle/signal.py b/python/paddle/signal.py
index 6ebc08b32d1ffb97ee83a6fc9d04d2aa2cc9936d..20a925c0f3667e4c5cd317e051c1d0b1f0938a64 100644
--- a/python/paddle/signal.py
+++ b/python/paddle/signal.py
@@ -266,18 +266,13 @@
     windows of the input using this formula:

     .. math::
-        X_t[\omega] = \sum_{n = 0}^{N-1}%
-                        \text{window}[n]\ x[t \times H + n]\ %
-                        e^{-{2 \pi j \omega n}/{N}}
+        X_t[f] = \sum_{n = 0}^{N-1} \text{window}[n]\ x[t \times H + n]\ e^{-{2 \pi j f n}/{N}}

     Where:
     - :math:`t`: The :math:`t`-th input window.
-
-    - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`,
-        or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`.
-
+    - :math:`f`: Frequency :math:`0 \leq f < \text{n\_fft}` for `onesided=False`,
+        or :math:`0 \leq f < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`.
     - :math:`N`: Value of `n_fft`.
-
     - :math:`H`: Value of `hop_length`.

     Args:
@@ -285,11 +280,11 @@
         x (Tensor): The input data which is a 1-dimensional or 2-dimensional Tensor with
             shape `[..., seq_length]`. It can be a real-valued or a complex Tensor.
         n_fft (int): The number of input samples to perform Fourier transform.
         hop_length (int, optional): Number of steps to advance between adjacent windows
-            and `0 < hop_length`. Default: `None`(treated as equal to `n_fft//4`)
-        win_length (int, optional): The size of window. Default: `None`(treated as equal
+            and `0 < hop_length`. Default: `None` (treated as equal to `n_fft//4`)
+        win_length (int, optional): The size of window. Default: `None` (treated as equal
             to `n_fft`)
         window (Tensor, optional): A 1-dimensional tensor of size `win_length`. It will
-            be center padded to length `n_fft` if `win_length < n_fft`. Default: `None`(
+            be center padded to length `n_fft` if `win_length < n_fft`. Default: `None` (
             treated as a rectangle window with value equal to 1 of size `win_length`).
         center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length`
             at the center of :math:`t`-th frame. Default: `True`.
@@ -438,21 +433,20 @@ def istft(
     Inverse short-time Fourier transform (ISTFT).

     Reconstruct time-domain signal from the giving complex input and window tensor when
-    nonzero overlap-add (NOLA) condition is met:
+    nonzero overlap-add (NOLA) condition is met:

     .. math::
-        \sum_{t = -\infty}^{\infty}%
-            \text{window}^2[n - t \times H]\ \neq \ 0, \ \text{for } all \ n
+        \sum_{t = -\infty}^{\infty} \text{window}^2[n - t \times H]\ \neq \ 0, \ \text{for all } n

     Where:
     - :math:`t`: The :math:`t`-th input window.
     - :math:`N`: Value of `n_fft`.
     - :math:`H`: Value of `hop_length`.

-    Result of `istft` expected to be the inverse of `paddle.signal.stft`, but it is
+    The result of `istft` is expected to be the inverse of `paddle.signal.stft`, but it is
     not guaranteed to reconstruct a exactly realizible time-domain signal from a STFT
     complex tensor which has been modified (via masking or otherwise). Therefore, `istft`
-    gives the [Griffin-Lim optimal estimate](https://ieeexplore.ieee.org/document/1164317)
+    gives the `Griffin-Lim optimal estimate <https://ieeexplore.ieee.org/document/1164317>`_
     (optimal in a least-squares sense) for the corresponding signal.

     Args:
@@ -460,9 +454,9 @@
         x (Tensor): The input data which is a 2-dimensional or 3-dimensional **complex**
             Tensor with shape `[..., n_fft, num_frames]`.
         n_fft (int): The size of Fourier transform.
         hop_length (int, optional): Number of steps to advance between adjacent windows
-            from time-domain signal and `0 < hop_length < win_length`. Default: `None`(
+            from time-domain signal and `0 < hop_length < win_length`. Default: `None` (
             treated as equal to `n_fft//4`)
-        win_length (int, optional): The size of window. Default: `None`(treated as equal
+        win_length (int, optional): The size of window. Default: `None` (treated as equal
             to `n_fft`)
         window (Tensor, optional): A 1-dimensional tensor of size `win_length`. It will
             be center padded to length `n_fft` if `win_length < n_fft`. It should be a
@@ -470,7 +464,7 @@
             a rectangle window with value equal to 1 of size `win_length`).
         center (bool, optional): It means that whether the time-domain signal has been
             center padded. Default: `True`.
-        normalized (bool, optional): Control whether to scale the output by `1/sqrt(n_fft)`.
+        normalized (bool, optional): Control whether to scale the output by :math:`1/\sqrt{n\_fft}`.
             Default: `False`
         onesided (bool, optional): It means that whether the input STFT tensor is
             a half of the conjugate symmetry STFT tensor transformed from a real-valued signal
@@ -486,7 +480,7 @@
     Returns:
         A tensor of least squares estimation of the reconstructed signal(s) with shape
-        `[..., seq_length]`
+        `[..., seq_length]`

     Examples:
         .. code-block:: python
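A round-trip sketch tying the `stft`/`istft` docstrings above together (shapes assume the documented defaults `hop_length=n_fft//4`, `onesided=True`, `center=True`; the sizes are illustrative):

.. code-block:: python

    import paddle

    x = paddle.randn([8, 48000])                  # batch of real-valued signals
    spec = paddle.signal.stft(x, n_fft=512)       # complex, shape [8, 257, num_frames]
    x_rec = paddle.signal.istft(spec, n_fft=512)  # least-squares time-domain estimate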
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 2a3ae8001e74300221d060fc02992ed3438fac6f..d91b63dc9ad488c82a71423ad9ea1d82d9bb2101 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -3261,7 +3261,7 @@ def eigvalsh(x, UPLO='L', name=None):
         complex Hermitian (conjugate symmetric) or a real symmetric matrix.

     Args:
-        x (Tensor): A tensor with shape :math:`[_, M, M]` , The data type of the input Tensor x
+        x (Tensor): A tensor with shape :math:`[*, M, M]`, where :math:`*` is zero or more batch dimensions. The data type of the input Tensor x
             should be one of float32, float64, complex64, complex128.
         UPLO(str, optional): Lower triangular part of a ('L', default) or the upper triangular part ('U').
         name(str, optional): The default value is None. Normally there is no need for user to set this
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 3bbf7831e8472c0cb4f96f508907ff6c8c7e2627..1922bdca30606fe39f299bc50710a4445363c51c 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -878,14 +878,14 @@ def randint_like(x, low=0, high=None, dtype=None, name=None):
     If ``high`` is None (the default), the range is [0, ``low``).

     Args:
-        x (Tensor): The input tensor which specifies shape. The dtype of ``x``
+        x (Tensor): The input multi-dimensional tensor which specifies shape. The dtype of ``x``
             can be bool, int32, int64, float16, float32, float64.
-        low (int): The lower bound on the range of random values to generate.
+        low (int, optional): The lower bound on the range of random values to generate.
             The ``low`` is included in the range. If ``high`` is None, the
             range is [0, ``low``). Default is 0.
         high (int, optional): The upper bound on the range of random values to
-            generate, the ``high`` is excluded in the range. Default is None
-            (see above for behavior if high = None). Default is None.
+            generate; the ``high`` is excluded from the range. Default is None.
+            If ``high`` is None, the range is [0, ``low``).
         dtype (str|np.dtype, optional): The data type of the output tensor.
             Supported data types: bool, int32, int64, float16, float32, float64.
             If ``dytpe`` is None, the data type is the
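A brief sketch of the `randint_like` semantics spelled out above (shape and dtype follow ``x`` when ``dtype`` is None; the tensors are illustrative):

.. code-block:: python

    import paddle

    x = paddle.zeros([2, 3], dtype='float32')
    a = paddle.randint_like(x, low=10)          # values in [0, 10); shape/dtype follow x
    b = paddle.randint_like(x, low=-5, high=5)  # values in [-5, 5)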