diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 6ba9acf382b7931e0f7d4783657ef408fd56aee4..ee7c2e36f73a1029be09ddfd72fd480490a787da 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -55,7 +55,7 @@ class LegacyPyLayerContext(object): """ Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors. - .. note:: + Note: This API should be called at most once, and only inside `forward`. Args: @@ -341,7 +341,7 @@ class EagerPyLayerContext(object): """ Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors. - .. note:: + Note: This API should be called at most once, and only inside `forward`. Args: diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index dc80bb9b563a10871ea06eac3d6215811d85e9cb..fa0f194969bc8e19517d9bf5c3c20f7ed001f160 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -196,7 +196,7 @@ def max_memory_allocated(device=None): ''' Return the peak size of gpu memory that is allocated to tensor of the given device. - .. note:: + Note: The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. @@ -262,7 +262,7 @@ def memory_allocated(device=None): ''' Return the current size of gpu memory that is allocated to tensor of the given device. - .. note:: + Note: The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need. For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 64b57c1106d0f26126d388f805e702b99ae5816b..45d6b006528117e4940fef9d8bcc580102c19e40 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1344,7 +1344,7 @@ def alltoall_single(in_tensor, """ Scatter a single input tensor to all participators and gather the received tensors in out_tensor. - .. note:: + Note: ``alltoall_single`` is only supported in eager mode. Args: diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index 223e3afbd5970fd108772d3f23fbdda6ac896a40..7e81043d319802028d872b65baef66778ceb4b56 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -30,9 +30,9 @@ def wait_server_ready(endpoints): ["127.0.0.1:8080", "127.0.0.1:8081"] Examples: - .. code-block:: python + .. code-block:: python - wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ assert not isinstance(endpoints, str) while True: diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 6fdbf8627c9f9e26ee7282fd4e1f43268b978d59..8a22526d3c2e50b77800904ae9b536226bc258a7 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -95,7 +95,7 @@ def init_parallel_env(): """ Initialize parallel training environment in dynamic graph mode. - .. 
note:: + Note: Now initialize both `NCCL` and `GLOO` contexts for communication. Args: diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 0cd67b84a4acb74721f711bcf48a0e4981c581b8..0ee3341b84607feb59f6e87cdbfc6e2e20a2274b 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -177,7 +177,7 @@ def save_group_sharded_model(model, output, optimizer=None): """ Group sharded encapsulated model and optimizer state saving module. - .. note:: + Note: If using save_group_sharded_model saves the model. When loading again, you need to set the model or optimizer state before using group_sharded_parallel. Args: diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py index c7695059fc0f5418b656020895063fd134f8a6ff..8978e850bff7db05598f300fdadf8b40e46fa706 100644 --- a/python/paddle/distribution/distribution.py +++ b/python/paddle/distribution/distribution.py @@ -119,7 +119,7 @@ class Distribution(object): def probs(self, value): """Probability density/mass function. - .. note:: + Note: This method will be deprecated in the future, please use `prob` instead. diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index eef33397a7e971f4e79f20923503e49cb05832cd..b7825c0517204942d022dd228ed8be76bdf37b46 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -575,10 +575,10 @@ def save(obj, path, protocol=4, **configs): ''' Save an object to the specified path. - .. note:: + Note: Now supports saving ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program. - .. note:: + Note: Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` of ``paddle.save`` will be directly used as the saved file name instead of a prefix. @@ -792,10 +792,10 @@ def load(path, **configs): ''' Load an object can be used in paddle from specified path. - .. note:: + Note: Now supports loading ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program. - .. note:: + Note: In order to use the model parameters saved by paddle more efficiently, ``paddle.load`` supports loading ``state_dict`` of Layer from the result of other save APIs except ``paddle.save`` , but the argument ``path`` format is @@ -811,7 +811,7 @@ def load(path, **configs): ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a directory, such as ``model`` and model is a directory. - .. note:: + Note: If you load ``state_dict`` from the saved result of static mode API such as ``paddle.static.save`` or ``paddle.static.save_inference_model`` , the structured variable name in dynamic mode will cannot be restored. diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 8806122b5a29be28368f3b2fc62afeef8b685dfd..fa289e259cbc31395835b4000cecc8952f1eaf46 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -22,7 +22,7 @@ from paddle.incubate.autograd import primx, utils def forward_grad(outputs, inputs, grad_inputs=None): """Forward mode of automatic differentiation. - .. 
note:: + Note: **ONLY available in the static mode and primitive operators.** Args: @@ -95,7 +95,7 @@ def forward_grad(outputs, inputs, grad_inputs=None): def grad(outputs, inputs, grad_outputs=None): """Reverse mode of automatic differentiation. - .. note:: + Note: **ONLY available in the static mode and primitive operators** Args: diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 28136f56821ed64bdd8b524e67d4608b32433b4a..d102dd2ac61867772e1b3e148a93dbdf274537cd 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -520,7 +520,7 @@ def _lower(block, reverse, blacklist): @framework.static_only def orig2prim(block=None): """ - .. note:: + Note: **This API is ONLY available in the static mode.** **Args block must be None or current block of main program.** @@ -544,7 +544,7 @@ def orig2prim(block=None): @framework.static_only def prim2orig(block=None, blacklist=None): """ - .. note:: + Note: **ONLY available in the static mode.** **Args block must be None or current block of main program.** diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 61221aa0afa7ac58e7c6b596cbd53514341a4228..effb8c1abc646b1e7e4497a6197006420ab097a8 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -35,7 +35,7 @@ prim_option = PrimOption() @framework.static_only def prim_enabled(): """ - .. note:: + Note: **ONLY available in the static mode.** Shows whether the automatic differentiation mechanism based on @@ -66,7 +66,7 @@ def prim_enabled(): @framework.static_only def enable_prim(): """ - .. note:: + Note: **ONLY available in the static mode.** Turns ON automatic differentiation mechanism based on automatic @@ -90,7 +90,7 @@ def enable_prim(): @framework.static_only def disable_prim(): """ - .. note:: + Note: **ONLY available in the static mode.** Turns OFF automatic differentiation mechanism based on automatic diff --git a/python/paddle/incubate/sparse/creation.py b/python/paddle/incubate/sparse/creation.py index 4d1081343fca9ceb569186a375f07b055ed48d03..2fe5a560d662623496d827e7ee36380beb63d473 100644 --- a/python/paddle/incubate/sparse/creation.py +++ b/python/paddle/incubate/sparse/creation.py @@ -95,12 +95,6 @@ def sparse_coo_tensor(indices, Returns: Tensor: A Tensor constructed from ``indices`` and ``values`` . - Raises: - TypeError: If the data type of ``values`` is not list, tuple, numpy.ndarray, paddle.Tensor - ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``indices`` is not a 2-D. - TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. - Examples: .. code-block:: python @@ -206,12 +200,6 @@ def sparse_csr_tensor(crows, Returns: Tensor: A Tensor constructed from ``crows``, ``cols`` and ``values`` . - Raises: - TypeError: If the data type of ``values`` is not list, tuple, numpy.ndarray, paddle.Tensor - ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``crow``, ``cols`` and ``values`` is not a 2-D. 
- TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. - Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 5a39aee304f9a39b1df5e484ac546f8c65c64507..b74471c9a184c517cb72878d264f07815fc7b368 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -248,7 +248,7 @@ def hardshrink(x, threshold=0.5, name=None): def hardtanh(x, min=-1.0, max=1.0, name=None): r""" - hardtanh activation + hardtanh activation. Calculate the `hardtanh` of input `x`. .. math:: @@ -275,9 +275,8 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) + x = paddle.to_tensor([-1.5, 0.3, 2.5]) out = F.hardtanh(x) # [-1., 0.3, 1.] """ @@ -304,8 +303,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): r""" - hardsigmoid activation. - + hardsigmoid activation. Calculate the `hardsigmoid` of input `x`. A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. @@ -362,11 +360,9 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): def hardswish(x, name=None): r""" - hardswish activation - - hardswish is proposed in MobileNetV3, and performs better in computational stability - and efficiency compared to swish function. For more details please refer - to: https://arxiv.org/pdf/1905.02244.pdf + hardswish activation. hardswish is proposed in MobileNetV3, and performs + better in computational stability and efficiency compared to swish function. + For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf .. math:: @@ -412,7 +408,7 @@ def hardswish(x, name=None): def leaky_relu(x, negative_slope=0.01, name=None): r""" - leaky_relu activation + leaky_relu activation. The calculation formula is: .. math:: leaky\_relu(x)= diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 9bcbf93767f9b8fe9139d3c885205015f46aa5ed..c08c89e8939c62424818ede2a292feca0f16b801 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -312,18 +312,6 @@ def conv1d(x, A tensor representing the conv1d, whose data type is the same with input. - Raises: - ValueError: If the channel dimension of the input is less than or equal to zero. - ValueError: If `data_format` is not "NCL" or "NLC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ShapeError: If the input is not 3-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 1. - ShapeError: If the number of input channels is not equal to filter's channels * groups. - ShapeError: If the number of output channels is not be divided by groups. - Examples: .. code-block:: python @@ -565,18 +553,6 @@ def conv2d(x, Returns: A Tensor representing the conv2d result, whose data type is the same with input. - Raises: - ValueError: If `data_format` is not "NCHW" or "NHWC". 
- ValueError: If the channel dimension of the input is less than or equal to zero. - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ShapeError: If the input is not 4-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels * groups. - ShapeError: If the number of output channels is not be divided by groups. - Examples: .. code-block:: python @@ -778,19 +754,6 @@ def conv1d_transpose(x, when data_format is `"NCL"` and (num_batches, length, channels) when data_format is `"NLC"`. - Raises: - ValueError: If `data_format` is a string, but not "NCL" or "NLC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and filter_size are None at the same time. - ValueError: If `output_padding` is greater than `stride`. - ShapeError: If the input is not 3-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 1. - ShapeError: If the number of input channels is not equal to filter's channels. - ShapeError: If the size of `output_size` is not equal to that of `stride`. - Examples: .. code-block:: python @@ -1062,18 +1025,6 @@ def conv2d_transpose(x, out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing transposed convolution result. - Raises: - ValueError: If `data_format` is not "NCHW" or "NHWC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and kernel_size are None at the same time. - ShapeError: If the input is not 4-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels. - ShapeError: If the size of `output_size` is not equal to that of `stride`. - Examples: .. code-block:: python @@ -1499,18 +1450,6 @@ def conv3d_transpose(x, variable storing the transposed convolution result, and if act is not None, the tensor variable storing transposed convolution and non-linearity activation result. - Raises: - ValueError: If `data_format` is not "NCDHW" or "NDHWC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and kernel_size are None at the same time. - ShapeError: If the input is not 5-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels. 
- ShapeError: If the size of `output_size` is not equal to that of `stride`. - Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index e3c7e96939c1e22152b57d51f6b5bbc4fbca4d98..a7b21fd5ca9697e5a9890b87b8befff128dc08f1 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -350,9 +350,6 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): out(Tensor): The temporal shifting result is a tensor with the same shape and same data type as the input. - Raises: - TypeError: seg_num must be int type. - Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 6af57cd49df4a71678b00d9acaa9d97f9db2084d..8d0d64c7c616bfda404f4a1d91e6a85da2ce6495 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1197,7 +1197,7 @@ def margin_ranking_loss(input, def l1_loss(input, label, reduction='mean', name=None): r""" - This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows. + Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. If `reduction` set to ``'none'``, the loss is: @@ -1228,8 +1228,8 @@ def l1_loss(input, label, reduction='mean', name=None): Returns: Tensor, the L1 Loss of Tensor ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . - If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: .. code-block:: python @@ -1418,7 +1418,7 @@ def nll_loss(input, def kl_div(input, label, reduction='mean', name=None): r""" - This operator calculates the Kullback-Leibler divergence loss + Calculate the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability and Input(Target) is the probability. @@ -1463,31 +1463,26 @@ def kl_div(input, label, reduction='mean', name=None): .. 
code-block:: python import paddle - import numpy as np import paddle.nn.functional as F shape = (5, 20) - input = np.random.uniform(-10, 10, shape).astype('float32') - target = np.random.uniform(-10, 10, shape).astype('float32') + x = paddle.uniform(shape, min=-10, max=10).astype('float32') + target = paddle.uniform(shape, min=-10, max=10).astype('float32') # 'batchmean' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='batchmean') + pred_loss = F.kl_div(x, target, reduction='batchmean') # shape=[1] # 'mean' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='mean') + pred_loss = F.kl_div(x, target, reduction='mean') # shape=[1] # 'sum' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='sum') + pred_loss = F.kl_div(x, target, reduction='sum') # shape=[1] # 'none' reduction, loss shape is same with input shape - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='none') + pred_loss = F.kl_div(x, target, reduction='none') # shape=[5, 20] """ @@ -2935,21 +2930,34 @@ def multi_label_soft_margin_loss(input, reduction="mean", name=None): r""" + Calculate a multi-class multi-classification + hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) + and output :math:`y` (which is a 2D `Tensor` of target class indices). + For each sample in the mini-batch: + + .. math:: + \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)} - Parameters: - input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input. - weight (Tensor,optional): a manual rescaling weight given to each class. - If given, has to be a Tensor of size C and the data type is float32, float64. - Default is ``'None'`` . - reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'none'``, the unreduced loss is returned; - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - If :attr:`reduction` is ``'sum'``, the summed loss is returned. - Default: ``'mean'`` - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \ + :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \ + :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \ + and :math:`i \neq y[j]` for all :math:`i` and :math:`j`. + :math:`y` and :math:`x` must have the same size. + + Parameters: + input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input. + weight (Tensor,optional): a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. + Default is ``'None'`` . 
+ reduction (str, optional): Indicate how to average the loss by batch_size, + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default: ``'mean'`` + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Shape: input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means number of classes, available dtype is float32, float64. The sum operationoperates over all the elements. @@ -3011,7 +3019,7 @@ def multi_label_soft_margin_loss(input, def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): r""" - This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). + Calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a label tensor :math:`y` (containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically used for learning nonlinear embeddings or semi-supervised learning. diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index cbc93e0deeaa76c055941b13b63c6d1cd9fc067c..b74e7eba948c4f49ce3fb348fb53fb715db3189b 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -294,11 +294,8 @@ def layer_norm(x, .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 3)) layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) print(layer_norm_out) """ @@ -385,14 +382,14 @@ def instance_norm(x, Parameters: x(Tensor): Input Tensor. It's data type should be float32, float64. - running_mean(Tensor): running mean. Default None. - running_var(Tensor): running variance. Default None. + running_mean(Tensor, optional): running mean. Default None. + running_var(Tensor, optional): running variance. Default None. weight(Tensor, optional): The weight tensor of instance_norm. Default: None. bias(Tensor, optional): The bias tensor of instance_norm. Default: None. eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - use_input_stats(bool): Default True. + use_input_stats(bool, optional): Default True. data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Default "NCHW". name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Returns: @@ -403,11 +400,8 @@ .. 
code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 3)) instance_norm_out = paddle.nn.functional.instance_norm(x) print(instance_norm_out) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index d2ba7468c89dbb64dd033749e8c0548e63f6402e..f3605b0f72bd67068e1b6202bf65e5d851ad1975 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -547,12 +547,6 @@ def max_pool1d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the input is not a 3-D tensor. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python @@ -1079,11 +1073,6 @@ def max_pool2d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python @@ -1220,11 +1209,6 @@ def max_pool3d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python @@ -1652,8 +1636,7 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): Returns: Tensor: The output tensor of adaptive pooling result. The data type is same as input tensor. - Raises: - ValueError: 'output_size' should be an integer. + Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index a8622bc50223d00341150639f35a490f1185f76a..0dbab2f64e407a4dda8bdaec3a24eabcef33b4b3 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -215,9 +215,8 @@ class Hardshrink(Layer): class Hardswish(Layer): r""" - Hardswish activation - - Hardswish is proposed in MobileNetV3, and performs better in computational stability + Hardswish activation. Create a callable object of `Hardswish`. Hardswish + is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function. For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf @@ -307,7 +306,7 @@ class Tanh(Layer): class Hardtanh(Layer): r""" - Hardtanh Activation + Hardtanh Activation. Create a callable object of `Hardtanh`. .. math:: @@ -659,7 +658,8 @@ class SELU(Layer): class LeakyReLU(Layer): r""" - Leaky ReLU Activation. + Leaky ReLU Activation. Create a callable object of `LeakyReLU` to calculate + the `LeakyReLU` of input `x`. .. math:: @@ -686,10 +686,9 @@ class LeakyReLU(Layer): .. code-block:: python import paddle - import numpy as np m = paddle.nn.LeakyReLU() - x = paddle.to_tensor(np.array([-2, 0, 1], 'float32')) + x = paddle.to_tensor([-2.0, 0, 1]) out = m(x) # [-0.02, 0., 1.] """ @@ -748,8 +747,8 @@ class Sigmoid(Layer): class Hardsigmoid(Layer): r""" - This interface is used to construct a callable object of the ``Hardsigmoid`` class. 
- This layer calcluate the `hardsigmoid` of input x. + ``Hardsigmoid`` Activation Layer. Construct a callable object of + the ``Hardsigmoid`` class. This layer calculates the `hardsigmoid` of input x. A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. @@ -765,7 +764,6 @@ \end{array} \right. - Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1366,7 +1364,7 @@ class LogSoftmax(Layer): class Maxout(Layer): r""" - Maxout Activation. + Maxout Activation. Create a callable object of `Maxout`. Assumed the input shape is (N, Ci, H, W). The output shape is (N, Co, H, W). diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 514be1ba3f9877b7c017f4500bf2563e35396eec..0bbcac444eb4634d1f9232f3ec773ebf41c4aac3 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -354,22 +354,6 @@ class Upsample(Layer): A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Raises: - TypeError: size should be a list or tuple or Tensor. - ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', or 'nearest' currently. - ValueError: 'linear' only support 3-D tensor. - ValueError: 'bilinear' and 'bicubic' only support 4-D tensor. - ValueError: 'trilinear' only support 5-D tensor. - ValueError: 'nearest' only support 4-D or 5-D tensor. - ValueError: One of size and scale_factor must not be None. - ValueError: size length should be 1 for input 3-D tensor. - ValueError: size length should be 2 for input 4-D tensor. - ValueError: size length should be 3 for input 5-D tensor. - ValueError: scale_factor should be greater than zero. - TypeError: align_corners should be a bool value - ValueError: align_mode can only be '0' or '1' - ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 34a3c2cc4e0a2ce5a1e1e7a0447022690b59555b..fd5583d50baae87ac46023bc800a3f684ef1381b 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -274,9 +274,6 @@ class Conv1D(_ConvNd): - bias: 1-D tensor with shape: (out_channels) - output: 3-D tensor with same shape as input x. - Raises: - None - Examples: .. code-block:: python @@ -927,10 +924,6 @@ class Conv3D(_ConvNd): W_{out}&= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - Examples: .. code-block:: python @@ -1105,9 +1098,6 @@ class Conv3DTranspose(_ConvNd): W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1 - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. Examples: .. 
code-block:: python diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 5ce3a9ea5c9bdeacbac6243aa570a2e40c2e8f4f..7b1415c1a5018ebf91780cc0e74c42e65ff8b247 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -570,15 +570,12 @@ class MSELoss(Layer): Examples: .. code-block:: python - import numpy as np import paddle - input_data = np.array([1.5]).astype("float32") - label_data = np.array([1.7]).astype("float32") - mse_loss = paddle.nn.loss.MSELoss() - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + mse_loss = paddle.nn.MSELoss() + input = paddle.to_tensor([1.5]) + label = paddle.to_tensor([1.7]) output = mse_loss(input, label) print(output) # [0.04000002] @@ -617,10 +613,10 @@ class L1Loss(Layer): r""" - This interface is used to construct a callable object of the ``L1Loss`` class. + Construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. - If `reduction` set to ``'none'``, the loss is: + If `reduction` set to ``'none'``, the loss is: .. math:: Out = \lvert input - label\rvert @@ -656,12 +652,9 @@ .. code-block:: python import paddle - import numpy as np - input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") - label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) + label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) l1_loss = paddle.nn.L1Loss() output = l1_loss(input, label) @@ -900,9 +893,10 @@ class NLLLoss(Layer): class KLDivLoss(Layer): r""" - This interface calculates the Kullback-Leibler divergence loss - between Input(X) and Input(Target). Notes that Input(X) is the - log-probability and Input(Target) is the probability. + Generate a callable object of 'KLDivLoss' to calculate the + Kullback-Leibler divergence loss between Input(X) and + Input(Target). Note that Input(X) is the log-probability + and Input(Target) is the probability. KL divergence loss is calculated as follows: @@ -930,35 +924,30 @@ .. 
code-block:: python import paddle - import numpy as np import paddle.nn as nn shape = (5, 20) - x = np.random.uniform(-10, 10, shape).astype('float32') - target = np.random.uniform(-10, 10, shape).astype('float32') + x = paddle.uniform(shape, min=-10, max=10).astype('float32') + target = paddle.uniform(shape, min=-10, max=10).astype('float32') # 'batchmean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='batchmean') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[1] # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[1] # 'sum' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='sum') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[1] # 'none' reduction, loss shape is same with X shape kldiv_criterion = nn.KLDivLoss(reduction='none') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[5, 20] """ @@ -1294,7 +1283,7 @@ class MultiLabelSoftMarginLoss(Layer): class HingeEmbeddingLoss(Layer): r""" - This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). + Create a callable object of `HingeEmbeddingLoss` to calculate hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a label tensor :math:`y` (containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically used for learning nonlinear embeddings or semi-supervised learning. diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 43dd7004c490b34fc1e8586391f293e2ca5eb3ac..46b4f6adefd6776a5097f369f11c7e9de8ccf6a7 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -110,7 +110,7 @@ class _InstanceNormBase(Layer): class InstanceNorm1D(_InstanceNormBase): r""" - Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + Create a callable object of `InstanceNorm1D`. Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCL `[batch, in_channels, length]` @@ -126,8 +126,7 @@ \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - Note: - `H` means height of feature map, `W` means width of feature map. + Where `H` means height of feature map, `W` means width of feature map. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -161,11 +160,8 @@ .. 
code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 3)) instance_norm = paddle.nn.InstanceNorm1D(2) instance_norm_out = instance_norm(x) @@ -181,7 +177,7 @@ class InstanceNorm1D(_InstanceNormBase): class InstanceNorm2D(_InstanceNormBase): r""" - Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + Create a callable object of `InstanceNorm2D`. Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, in_height, in_width]` @@ -198,8 +194,7 @@ \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - Note: - `H` means height of feature map, `W` means width of feature map. + Where `H` means height of feature map, `W` means width of feature map. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -232,11 +227,8 @@ .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 3)) instance_norm = paddle.nn.InstanceNorm2D(2) instance_norm_out = instance_norm(x) @@ -251,7 +243,7 @@ class InstanceNorm2D(_InstanceNormBase): class InstanceNorm3D(_InstanceNormBase): r""" - Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + Create a callable object of `InstanceNorm3D`. Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` @@ -268,8 +260,7 @@ \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - Note: - `H` means height of feature map, `W` means width of feature map. + Where `H` means height of feature map, `W` means width of feature map. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -302,11 +293,8 @@ .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 2, 3)) instance_norm = paddle.nn.InstanceNorm3D(2) instance_norm_out = instance_norm(x) @@ -464,11 +452,7 @@ class GroupNorm(Layer): class LayerNorm(Layer): r""" - :alias_main: paddle.nn.LayerNorm - :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm - :old_api: paddle.fluid.dygraph.LayerNorm - - This interface is used to construct a callable object of the ``LayerNorm`` class. + Construct a callable object of the ``LayerNorm`` class. For more details, refer to code examples. 
It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. Refer to `Layer Normalization `_ @@ -516,12 +500,9 @@ class LayerNorm(Layer): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) - layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) + x = paddle.rand((2, 2, 2, 3)) + layer_norm = paddle.nn.LayerNorm(x.shape[1:]) layer_norm_out = layer_norm(x) print(layer_norm_out) @@ -760,11 +741,8 @@ class BatchNorm1D(_BatchNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 1, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 1, 3)) batch_norm = paddle.nn.BatchNorm1D(1) batch_norm_out = batch_norm(x) @@ -862,11 +840,8 @@ class BatchNorm2D(_BatchNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 1, 2, 3)) batch_norm = paddle.nn.BatchNorm2D(1) batch_norm_out = batch_norm(x) @@ -950,11 +925,8 @@ class BatchNorm3D(_BatchNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 1, 2, 2, 3)) batch_norm = paddle.nn.BatchNorm3D(1) batch_norm_out = batch_norm(x) diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 927cbe20ced688d840fa48376ab58a66042483a5..68d5fa553b6d0fb6bc28b6cc22a3e0732a2bb424 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -338,14 +338,6 @@ class MaxPool1D(Layer): Returns: A callable object of MaxPool1D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. - ShapeError: If the output's shape calculated is not greater than 0. - - Shape: - x(Tensor): The input tensor of max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. @@ -442,10 +434,6 @@ class MaxPool2D(Layer): Returns: A callable object of MaxPool2D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. Shape: - x(Tensor): The input tensor of max pool2d operator, which is a 4-D tensor. @@ -539,10 +527,6 @@ class MaxPool3D(Layer): Returns: A callable object of MaxPool3D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. Shape: - x(Tensor): The input tensor of max pool3d operator, which is a 5-D tensor. @@ -871,9 +855,6 @@ class AdaptiveMaxPool1D(Layer): Returns: A callable object of AdaptiveMaxPool1D. - Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. - Shape: - x(Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. 
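For quick reference, here is a minimal usage sketch of the ``AdaptiveMaxPool1D`` layer whose ``Raises`` section is trimmed above; the input shape and ``output_size`` are illustrative assumptions, not values taken from this patch:

.. code-block:: python

    import paddle

    # 3-D input in (batch, channels, length) layout; values are arbitrary
    x = paddle.rand((1, 3, 32))
    # adaptive pooling always produces the requested output length
    pool = paddle.nn.AdaptiveMaxPool1D(output_size=16)
    out = pool(x)
    print(out.shape)  # [1, 3, 16]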
diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 0a89e8a386e9cd0a3f1b563f072613309a203b65..4cde491a15fa8581965a3b0c2e5aed783326584f 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -133,7 +133,7 @@ def spectral_norm(layer, eps=1e-12, dim=None): r""" - This spectral_norm layer applies spectral normalization to a parameter according to the + Applies spectral normalization to a parameter according to the following Calculation: Step 1: @@ -171,7 +171,7 @@ def spectral_norm(layer, dim(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: None. Returns: - The original layer with the spectral norm hook + Layer, the original layer with the spectral norm hook. Examples: .. code-block:: python diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 7219077db91f97ee8cf7ce5d4ad8b6c81202f321..d2b0806712281fc12d9d2d58bcb708b274a2a822 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -164,7 +164,7 @@ class WeightNorm(object): def weight_norm(layer, name='weight', dim=0): r""" - This weight_norm layer applies weight normalization to a parameter according to the + Applies weight normalization to a parameter according to the following formula: .. math:: @@ -193,11 +193,9 @@ def weight_norm(layer, name='weight', dim=0): Examples: .. code-block:: python - import numpy as np from paddle.nn import Conv2D from paddle.nn.utils import weight_norm - x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') conv = Conv2D(3, 5, 3) wn = weight_norm(conv) print(conv.weight_g.shape) @@ -218,7 +216,7 @@ def remove_weight_norm(layer, name='weight'): name(str, optional): Name of the weight parameter. Default: 'weight'. Returns: - Origin layer without weight norm + Layer, the origin layer without weight norm Examples: .. code-block:: python diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 43404046bc0124861603f89fb5ac472bc1ea1562..4dc451d6334eabf48d48ce2d415d665cc25e8637 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1559,7 +1559,6 @@ class MultiplicativeDecay(LRScheduler): .. code-block:: python import paddle - import numpy as np # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) @@ -1847,7 +1846,7 @@ class CyclicLR(LRScheduler): verbose: (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``CyclicLR`` instance to schedule learning rate. + ``CyclicLR`` instance to schedule learning rate. Examples: .. code-block:: python diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index f0ca03a61a695b839e31e8cc0f2ab5b928202006..0b8d7f86fa6f0674149b66aaa0bd54fee5250379 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -71,12 +71,12 @@ class RMSProp(Optimizer): Parameters: learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``. It can be a float value or a LRScheduler. - rho(float): rho is :math:`\rho` in equation, default is 0.95. 
- epsilon(float): :math:`\epsilon` in equation is smoothing term to + rho(float, optional): rho is :math:`\rho` in equation, default is 0.95. + epsilon(float, optional): :math:`\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. - momentum(float): :math:`\beta` in equation is the momentum term, + momentum(float, optional): :math:`\beta` in equation is the momentum term, default is 0.0. - centered(bool): If True, gradients are normalized by the estimated variance of + centered(bool, optional): If True, gradients are normalized by the estimated variance of the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. @@ -100,9 +100,6 @@ class RMSProp(Optimizer): name (str, optional): This parameter is used by developers to print debugging information. For details, please refer to :ref:`api_guide_Name`. Default is None. - Raises: - ValueError: If learning_rate, rho, epsilon, momentum are None. - Examples: .. code-block:: python diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index a01628e184a03cce3c870e07c6d5f3ea6530de2a..1104ba5a29b311344be48892f5d0b298f76a3703 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -264,9 +264,6 @@ def compose(*readers, **kwargs): Returns: the new data reader (Reader). - Raises: - ComposeNotAligned: outputs of readers are not aligned. This will not raise if check_alignment is set to False. - Examples: .. code-block:: python diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index de9e48b3367cc70ec6139e9dcdf6859d926db9e9..c91bf14c93974c0096db7097a7f93ea4d47bcb8a 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -132,11 +132,6 @@ def normalize_program(program, feed_vars, fetch_vars): Returns: Program: Normalized/Optimized program. - Raises: - TypeError: If `program` is not a Program, an exception is thrown. - TypeError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - TypeError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -266,10 +261,6 @@ def serialize_program(feed_vars, fetch_vars, **kwargs): Returns: bytes: serialized program. - Raises: - ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -329,10 +320,6 @@ def serialize_persistables(feed_vars, fetch_vars, executor, **kwargs): Returns: bytes: serialized program. - Raises: - ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -477,10 +464,6 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor, Returns: None - Raises: - ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -760,9 +743,6 @@ def load_inference_model(path_prefix, executor, **kwargs): ``Variable`` (refer to :ref:`api_guide_Program_en`). It contains variables from which we can get inference results. 
- Raises: - ValueError: If `path_prefix.pdmodel` or `path_prefix.pdiparams` doesn't exist. - Examples: .. code-block:: python diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index af2322895f80c80c75b3f39951f57a31bae21b94..baeb39e8cf650637fade9c23cb0f290d5e1b9bb7 100755 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -122,9 +122,6 @@ def fc(x, Returns: Tensor, its shape is :math:`[batch\_size, *, size]` , and the data type is same with input. - Raises: - ValueError: If dimensions of the input tensor is less than 2. - Examples: .. code-block:: python @@ -275,9 +272,7 @@ def deform_conv2d(x, Returns: Tensor: The tensor storing the deformable convolution \ result. A Tensor with type float32, float64. - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. + Examples: .. code-block:: python diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 6765394469b3722f9f3a167dfeb3a6da31dc005c..b0ef5820a1922ac5eab321f5d77d9aa619ba2c8f 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -260,7 +260,7 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): Returns the matrix norm (Frobenius) or vector norm (the 1-norm, the Euclidean or 2-norm, and in general the p-norm for p > 0) of a given tensor. - .. note:: + Note: This norm API is different from `numpy.linalg.norm`. This api supports high-order input tensors (rank >= 3), and certain axis need to be pointed out to calculate the norm. But `numpy.linalg.norm` only supports 1-D vector or 2-D matrix as input tensor. @@ -1029,7 +1029,7 @@ def dot(x, y, name=None): """ This operator calculates inner product for vectors. - .. note:: + Note: Support 1-d and 2-d Tensor. When it is 2d, the first dimension of this matrix is the batch dimension, which means that the vectors of multiple batches are dotted. @@ -1361,10 +1361,12 @@ def cholesky(x, upper=False, name=None): Its data type should be float32 or float64. upper (bool): The flag indicating whether to return upper or lower triangular matrices. Default: False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A Tensor with same shape and data type as `x`. It represents \ - triangular matrices generated by Cholesky decomposition. + Tensor, A Tensor with same shape and data type as `x`. It represents + triangular matrices generated by Cholesky decomposition. Examples: .. code-block:: python @@ -1743,24 +1745,27 @@ def mv(x, vec, name=None): def det(x, name=None): """ Calculates determinant value of a square matrix or batches of square matrices. + Args: - x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the batch of matrices of size - `(*, n, n)` where `*` is one or more batch dimensions. + x (Tensor): the input matrix of size `(n, n)` or the + batch of matrices of size `(*, n, n)` where `*` is one or more + batch dimensions. + Returns: - y (Tensor):the determinant value of a square matrix or batches of square matrices. + Tensor, the determinant value of a square matrix or batches of square matrices. Examples: .. 
code-block:: python - import paddle + import paddle - x = paddle.randn([3,3,3]) + x = paddle.randn([3,3,3]) - A = paddle.linalg.det(x) + A = paddle.linalg.det(x) - print(A) + print(A) - # [ 0.02547996, 2.52317095, -6.15900707]) + # [ 0.02547996, 2.52317095, -6.15900707]) """ @@ -1809,18 +1814,18 @@ def slogdet(x, name=None): of the absolute value of determinant, respectively. Examples: - .. code-block:: python + .. code-block:: python - import paddle + import paddle - x = paddle.randn([3,3,3]) + x = paddle.randn([3,3,3]) - A = paddle.linalg.slogdet(x) + A = paddle.linalg.slogdet(x) - print(A) + print(A) - # [[ 1. , 1. , -1. ], - # [-0.98610914, -0.43010661, -0.10872950]]) + # [[ 1. , 1. , -1. ], + # [-0.98610914, -0.43010661, -0.10872950]]) """ if in_dygraph_mode(): @@ -1936,13 +1941,11 @@ def matrix_power(x, n, name=None): Specifically, - - If `n > 0`, it returns the matrix or a batch of matrices raised to the power - of `n`. + - If `n > 0`, it returns the matrix or a batch of matrices raised to the power of `n`. - If `n = 0`, it returns the identity matrix or a batch of identity matrices. - - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to - the power of `abs(n)`. + - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to the power of `abs(n)`. Args: x (Tensor): A square matrix or a batch of square matrices to be raised @@ -2079,10 +2082,12 @@ def lu(x, pivot=True, get_infos=False, name=None): Pivoting is done if pivot is set to True. P mat can be get by pivots: - # ones = eye(rows) #eye matrix of rank rows - # for i in range(cols): - # swap(ones[i], ones[pivots[i]]) - # return ones + + .. code-block:: text + ones = eye(rows) #eye matrix of rank rows + for i in range(cols): + swap(ones[i], ones[pivots[i]]) + return ones Args: @@ -2096,15 +2101,15 @@ def lu(x, pivot=True, get_infos=False, name=None): For more information, please refer to :ref:`api_guide_Name`. Returns: - factorization (Tensor): LU matrix, the factorization of input X. + factorization (Tensor), LU matrix, the factorization of input X. - pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the - intermediate transpositions of rows. The final permutation `perm` could be - reconstructed by this, details refer to upper example. + pivots (IntTensor), the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the + intermediate transpositions of rows. The final permutation `perm` could be + reconstructed by this, details refer to upper example. - infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2)) - where non-zero values indicate whether factorization for the matrix or each minibatch - has succeeded or failed. + infos (IntTensor, optional), if `get_infos` is `True`, this is a tensor of size (∗(N-2)) + where non-zero values indicate whether factorization for the matrix or each minibatch + has succeeded or failed. Examples: @@ -2180,9 +2185,11 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): unpack L and U matrix from LU, unpack permutation matrix P from Pivtos . P mat can be get by pivots: - # ones = eye(rows) #eye matrix of rank rows - # for i in range(cols): - # swap(ones[i], ones[pivots[i]]) + + .. code-block:: text + ones = eye(rows) #eye matrix of rank rows + for i in range(cols): + swap(ones[i], ones[pivots[i]]) Args: @@ -2198,11 +2205,11 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): For more information, please refer to :ref:`api_guide_Name`. 
@@ -2198,11 +2205,11 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
             For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        P (Tensor): Permutation matrix P of lu factorization.
+        P (Tensor), Permutation matrix P of lu factorization.
 
-        L (Tensor): The lower triangular matrix tensor of lu factorization.
+        L (Tensor), The lower triangular matrix tensor of lu factorization.
 
-        U (Tensor): The upper triangular matrix tensor of lu factorization.
+        U (Tensor), The upper triangular matrix tensor of lu factorization.
 
     Examples:
@@ -2279,14 +2286,14 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
 def eig(x, name=None):
     """
-    This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices.
+    Performs the eigenvalue decomposition of a square matrix or a batch of square matrices.
 
-    .. note::
-        If the matrix is a Hermitian or a real symmetric matrix, please use :ref:`paddle.linalg.eigh` instead, which is much faster.
-        If only eigenvalues is needed, please use :ref:`paddle.linalg.eigvals` instead.
-        If the matrix is of any shape, please use :ref:`paddle.linalg.svd`.
-        This API is only supported on CPU device.
-        The output datatype is always complex for both real and complex input.
+    Note:
+        - If the matrix is a Hermitian or a real symmetric matrix, please use :ref:`paddle.linalg.eigh` instead, which is much faster.
+        - If only eigenvalues are needed, please use :ref:`paddle.linalg.eigvals` instead.
+        - If the matrix is of any shape, please use :ref:`paddle.linalg.svd`.
+        - This API is only supported on CPU device.
+        - The output datatype is always complex for both real and complex input.
 
     Args:
         x (Tensor): A tensor with shape math:`[*, N, N]`, The data type of the x should be one of ``float32``,
@@ -2302,16 +2309,14 @@ def eig(x, name=None):
         .. code-block:: python
 
             import paddle
-            import numpy as np
 
             paddle.device.set_device("cpu")
 
-            x_data = np.array([[1.6707249, 7.2249975, 6.5045543],
+            x = paddle.to_tensor([[1.6707249, 7.2249975, 6.5045543],
                                [9.956216,  8.749598,  6.066444 ],
-                               [4.4251957, 1.7983172, 0.370647 ]]).astype("float32")
-            x = paddle.to_tensor(x_data)
+                               [4.4251957, 1.7983172, 0.370647 ]])
             w, v = paddle.linalg.eig(x)
-            print(w)
+            print(v)
             # Tensor(shape=[3, 3], dtype=complex128, place=CPUPlace, stop_gradient=False,
             #       [[(-0.5061363550800655+0j) , (-0.7971760990842826+0j) ,
             #         (0.18518077798279986+0j)],
@@ -2320,7 +2325,7 @@
             #        [(-0.23142567697893396+0j), (0.4944999840400175+0j) ,
             #         (0.7058765252952796+0j)  ]])
 
-            print(v)
+            print(w)
             # Tensor(shape=[3], dtype=complex128, place=CPUPlace, stop_gradient=False,
             #       [ (16.50471283351188+0j)  , (-5.5034820550763515+0j) ,
             #         (-0.21026087843552282+0j)])
@@ -2362,8 +2367,8 @@ def eigvals(x, name=None):
         For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`.
-            The eigenvalues are complex-valued even when `x` is real.
+        Tensor, A tensor containing the unsorted eigenvalues which has the same batch
+        dimensions with `x`. The eigenvalues are complex-valued even when `x` is real.
 
     Examples:
         .. code-block:: python
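The reflowed ``eigvals`` Returns text above ends with the claim that eigenvalues are complex-valued even for real input; a quick editor's check, not from the patch (the device call mirrors the CPU-only note on ``eig``):

.. code-block:: python

    import paddle

    paddle.device.set_device("cpu")

    # a real 90-degree rotation matrix: its eigenvalues are the pair ±1j
    x = paddle.to_tensor([[0., -1.], [1., 0.]])
    w = paddle.linalg.eigvals(x)
    print(w.dtype)  # a complex dtype even though x is real
    print(w)        # [0+1j, 0-1j], up to ordering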
@@ -2450,23 +2455,18 @@ def multi_dot(x, name=None):
             import paddle
-            import numpy as np
 
             # A * B
-            A_data = np.random.random([3, 4]).astype(np.float32)
-            B_data = np.random.random([4, 5]).astype(np.float32)
-            A = paddle.to_tensor(A_data)
-            B = paddle.to_tensor(B_data)
+            A = paddle.rand([3, 4])
+            B = paddle.rand([4, 5])
             out = paddle.linalg.multi_dot([A, B])
-            print(out.numpy().shape)
+            print(out.shape)
             # [3, 5]
 
             # A * B * C
-            A_data = np.random.random([10, 5]).astype(np.float32)
-            B_data = np.random.random([5, 8]).astype(np.float32)
-            C_data = np.random.random([8, 7]).astype(np.float32)
-            A = paddle.to_tensor(A_data)
-            B = paddle.to_tensor(B_data)
-            C = paddle.to_tensor(C_data)
+            A = paddle.rand([10, 5])
+            B = paddle.rand([5, 8])
+            C = paddle.rand([8, 7])
             out = paddle.linalg.multi_dot([A, B, C])
-            print(out.numpy().shape)
+            print(out.shape)
             # [10, 7]
 
     """
@@ -2504,18 +2504,17 @@ def eigh(x, UPLO='L', name=None):
             property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-
-        out_value(Tensor): A Tensor with shape [*, N] and data type of float32 and float64. The eigenvalues of eigh op.
-        out_vector(Tensor): A Tensor with shape [*, N, N] and data type of float32,float64,complex64 and complex128. The eigenvectors of eigh op.
+        - out_value(Tensor): A Tensor with shape [*, N] and data type of float32 and float64.
+          The eigenvalues of eigh op.
+        - out_vector(Tensor): A Tensor with shape [*, N, N] and data type of float32, float64,
+          complex64 and complex128. The eigenvectors of eigh op.
 
     Examples:
         .. code-block:: python
 
-            import numpy as np
             import paddle
 
-            x_data = np.array([[1, -2j], [2j, 5]])
-            x = paddle.to_tensor(x_data)
+            x = paddle.to_tensor([[1, -2j], [2j, 5]])
             out_value, out_vector = paddle.linalg.eigh(x, UPLO='L')
             print(out_value)
             #[0.17157288, 5.82842712]
@@ -2924,8 +2923,8 @@ def solve(x, y, name=None):
     .. math::
         Out = X^-1 * Y
-    Specifically,
-    - This system of linear equations has one solution if and only if input 'X' is invertible.
+
+    Specifically, this system of linear equations has one solution if and only if input 'X' is invertible.
 
     Args:
         x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or
@@ -2940,23 +2939,21 @@
             Its data type should be the same as that of `x`.
 
     Examples:
-    .. code-block:: python
 
-        # a square system of linear equations:
-        # 2*X0 + X1 = 9
-        # X0 + 2*X1 = 8
+        .. code-block:: python
 
-        import paddle
-        import numpy as np
+            # a square system of linear equations:
+            # 2*X0 + X1 = 9
+            # X0 + 2*X1 = 8
 
-        np_x = np.array([[3, 1],[1, 2]])
-        np_y = np.array([9, 8])
-        x = paddle.to_tensor(np_x, dtype="float64")
-        y = paddle.to_tensor(np_y, dtype="float64")
-        out = paddle.linalg.solve(x, y)
+            import paddle
+
+            x = paddle.to_tensor([[3, 1],[1, 2]], dtype="float64")
+            y = paddle.to_tensor([9, 8], dtype="float64")
+            out = paddle.linalg.solve(x, y)
 
-        print(out)
-        # [2., 3.])
+            print(out)
+            # [2., 3.]
     """
     if in_dygraph_mode():
         return _C_ops.solve(x, y)
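On the ``solve`` hunk above: the formula Out = X^-1 * Y is best read as a specification, not a recipe. An editor's sketch contrasting ``solve`` with the literal inverse (``paddle.linalg.inv`` is used here only for illustration):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[3., 1.], [1., 2.]], dtype="float64")
    y = paddle.to_tensor([9., 8.], dtype="float64")

    # solve computes X^-1 * Y directly, without materializing X^-1
    out = paddle.linalg.solve(x, y)

    # the literal reading of the formula gives the same numbers,
    # but forming the inverse is slower and less numerically stable
    via_inverse = paddle.linalg.inv(x) @ y

    print(out, via_inverse)  # both approximately [2., 3.]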
@@ -3009,24 +3006,24 @@ def triangular_solve(x,
         Tensor: The solution of the system of equations. Its data type should be the same as that of `x`.
 
     Examples:
-    .. code-block:: python
+        .. code-block:: python
 
-        # a square system of linear equations:
-        # x1 + x2 + x3 = 0
-        # 2*x2 + x3 = -9
-        # -x3 = 5
+            # a square system of linear equations:
+            # x1 + x2 + x3 = 0
+            # 2*x2 + x3 = -9
+            # -x3 = 5
 
-        import paddle
-        import numpy as np
+            import paddle
 
-        x = paddle.to_tensor([[1, 1, 1],
-                              [0, 2, 1],
-                              [0, 0,-1]], dtype="float64")
-        y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
-        out = paddle.linalg.triangular_solve(x, y, upper=True)
+            x = paddle.to_tensor([[1, 1, 1],
+                                  [0, 2, 1],
+                                  [0, 0,-1]], dtype="float64")
+            y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
+            out = paddle.linalg.triangular_solve(x, y, upper=True)
 
-        print(out)
-        # [7, -2, -5]
+            print(out)
+            # [7, -2, -5]
     """
     if in_dygraph_mode():
         return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular)
@@ -3076,18 +3073,18 @@ def cholesky_solve(x, y, upper=False, name=None):
         Tensor: The solution of the system of equations. Its data type is the same as that of `x`.
 
     Examples:
-    .. code-block:: python
+        .. code-block:: python
 
-        import paddle
+            import paddle
 
-        u = paddle.to_tensor([[1, 1, 1],
-                              [0, 2, 1],
-                              [0, 0,-1]], dtype="float64")
-        b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
-        out = paddle.linalg.cholesky_solve(b, u, upper=True)
+            u = paddle.to_tensor([[1, 1, 1],
+                                  [0, 2, 1],
+                                  [0, 0,-1]], dtype="float64")
+            b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
+            out = paddle.linalg.cholesky_solve(b, u, upper=True)
 
-        print(out)
-        # [-2.5, -7, 9.5]
+            print(out)
+            # [-2.5, -7, 9.5]
     """
     if in_dygraph_mode():
         return _C_ops.cholesky_solve(x, y, upper)
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index e8b699b1f085f08d7ceca3475b0f9b4436862aac..eb12ae7395329fd7d5a9a0134ccadd27e865fd32 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -91,7 +91,7 @@ def logical_and(x, y, out=None, name=None):
 
         out = x \&\& y
 
-    .. note::
+    Note:
         ``paddle.logical_and`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
 
     Args:
@@ -134,7 +134,7 @@ def logical_or(x, y, out=None, name=None):
 
        out = x || y
 
-    .. note::
+    Note:
        ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
 
     Args:
@@ -179,7 +179,7 @@ def logical_xor(x, y, out=None, name=None):
 
        out = (x || y) \&\& !(x \&\& y)
 
-    .. note::
+    Note:
        ``paddle.logical_xor`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
 
     Args:
@@ -957,13 +957,6 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     Returns:
         Tensor: ${out_comment}.
 
-    Raises:
-        TypeError: The data type of ``x`` must be one of float32, float64.
-        TypeError: The data type of ``y`` must be one of float32, float64.
-        TypeError: The type of ``rtol`` must be float.
-        TypeError: The type of ``atol`` must be float.
-        TypeError: The type of ``equal_nan`` must be bool.
-
     Examples:
         .. code-block:: python
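All three ``Note:`` blocks rewritten in logic.py above defer to the broadcasting guide; a one-screen editor's illustration of what that buys (shapes chosen for the example, not from the patch):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([True, False, True])   # shape [3]
    y = paddle.to_tensor([[True], [False]])     # shape [2, 1]

    # shapes [3] and [2, 1] broadcast to [2, 3]
    print(paddle.logical_and(x, y))
    # [[True , False, True ],
    #  [False, False, False]]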
-        TypeError: The type of ``ends`` must be list, tuple or Tensor.
-
     Examples:
         .. code-block:: python
@@ -441,9 +437,6 @@ def unstack(x, axis=0, num=None):
     Returns:
         list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64.
 
-    Raises:
-        ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D).
-
     Examples:
         .. code-block:: python
@@ -1132,7 +1125,7 @@ def broadcast_tensors(input, name=None):
     """
     Broadcast a list of tensors following broadcast semantics
 
-    .. note::
+    Note:
        If you want know more about broadcasting, please refer to `Introduction to Tensor`_ .
 
    .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor
@@ -1430,10 +1423,6 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
             axes flattened by indicated start axis and end axis. \
             A Tensor with data type same as input x.
 
-    Raises:
-        ValueError: If x is not a Tensor.
-        ValueError: If start_axis or stop_axis is illegal.
-
     Examples:
         .. code-block:: python
@@ -2119,7 +2108,8 @@ def unique_consecutive(x,
     r"""
     Eliminates all but the first element from every consecutive group of equivalent elements.
 
-    .. note:: This function is different from :func:`paddle.unique` in the sense that this function
+    Note:
+        This function is different from :func:`paddle.unique` in the sense that this function
         only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++.
 
     Args:
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 4807b4c7d5920c193808e11b0ea50dca6e112ad1..adaf928e163b865c87955b2d22fe89f8667af373 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -4130,7 +4130,7 @@ def erfinv(x, name=None):
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        out (Tensor): An N-D Tensor, the shape and data type is the same with input.
+        out (Tensor), an N-D Tensor whose shape and data type are the same as the input.
 
     Example:
         .. code-block:: python
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index a14266412c4b382a2f200040b85230779544bff7..2a9cefa0b9e88c2e2ffd28a400385c8ecf2a5c75 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -73,7 +73,7 @@ def setup(**attr):
     If the above conditions are not met, the corresponding warning will be printed, and a
     fatal error may occur because of ABI compatibility.
 
-    .. note::
+    Note:
         1. Currently we support Linux, MacOS and Windows platfrom.
         2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
@@ -230,7 +230,7 @@ def CppExtension(sources, *args, **kwargs):
         )
 
-    .. note::
+    Note:
        It is mainly used in ``setup`` and the nama of built shared library keeps same
        as ``name`` argument specified in ``setup`` interface.
@@ -282,7 +282,7 @@ def CUDAExtension(sources, *args, **kwargs):
         )
 
-    .. note::
+    Note:
        It is mainly used in ``setup`` and the nama of built shared library keeps same
        as ``name`` argument specified in ``setup`` interface.
@@ -772,7 +772,7 @@ def load(name,
     ``python setup.py install`` command. The interface contains all compiling and installing
     process underground.
 
-    .. note::
+    Note:
        1. Currently we support Linux, MacOS and Windows platfrom.
        2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
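The reworded ``unique_consecutive`` note above contrasts it with ``paddle.unique``; the difference is clearest side by side. An editor's sketch with illustrative values, not part of the patch:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2])

    # unique_consecutive only collapses runs of equal values,
    # like std::unique in C++
    print(paddle.unique_consecutive(x))
    # [1, 2, 3, 1, 2]

    # unique deduplicates (and sorts) globally
    print(paddle.unique(x))
    # [1, 2, 3]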
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 2a17036693f5cf2a77515aa75850965427de62c1..b8a0fa7ee22c13f7b01a5b6c39d2b34ee64978bc 100755
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -150,16 +150,6 @@ def yolo_loss(x,
     Returns:
         Tensor: A 1-D tensor with shape [N], the value of yolov3 loss
 
-    Raises:
-        TypeError: Input x of yolov3_loss must be Tensor
-        TypeError: Input gtbox of yolov3_loss must be Tensor
-        TypeError: Input gtlabel of yolov3_loss must be Tensor
-        TypeError: Input gtscore of yolov3_loss must be None or Tensor
-        TypeError: Attr anchors of yolov3_loss must be list or tuple
-        TypeError: Attr class_num of yolov3_loss must be an integer
-        TypeError: Attr ignore_thresh of yolov3_loss must be a float number
-        TypeError: Attr use_label_smooth of yolov3_loss must be a bool value
-
     Examples:
         .. code-block:: python
@@ -347,12 +337,6 @@ def yolo_box(x,
         and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification
         scores of boxes.
 
-    Raises:
-        TypeError: Input x of yolov_box must be Tensor
-        TypeError: Attr anchors of yolo box must be list or tuple
-        TypeError: Attr class_num of yolo box must be an integer
-        TypeError: Attr conf_thresh of yolo box must be a float number
-
     Examples:
         .. code-block:: python
@@ -511,9 +495,7 @@ def deform_conv2d(x,
     Returns:
         Tensor: The tensor variable storing the deformable convolution \
             result. A Tensor with type float32, float64.
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-            groups mismatch.
+
     Examples:
         .. code-block:: python
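Closing editor's aside on the ``deform_conv2d`` hunk: with the Raises list gone, the shape contract carries the weight. A minimal sketch of a well-formed call; the shapes are the editor's illustration (``offset`` carries ``2 * kh * kw`` channels per deformable group) and are not taken from the patch:

.. code-block:: python

    import paddle
    from paddle.vision.ops import deform_conv2d

    x = paddle.rand([1, 3, 8, 8])                 # input: [N, C_in, H, W]
    kh, kw = 3, 3
    offset = paddle.rand([1, 2 * kh * kw, 8, 8])  # offsets: [N, 2*kh*kw, H_out, W_out]
    weight = paddle.rand([5, 3, kh, kw])          # filters: [C_out, C_in, kh, kw]

    # padding=1 keeps the 8x8 spatial size for a 3x3 kernel
    out = deform_conv2d(x, offset, weight, padding=1)
    print(out.shape)  # [1, 5, 8, 8]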