[cherry-pick2.4]en-docs warning&error fix (#48332)

* fixdocs, test=document_fix * fixdocs, test=document_fix

[cherry-pick2.4]en-docs warning&error fix (#48332)
* fixdocs, test=document_fix * fixdocs, test=document_fix
1490aaa9 · ustiniankw · GitHub · 3fa7a736 · 1490aaa9 · 1490aaa9
38 changed files
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
--- a/python/paddle/distributed/fleet/base/topology.py
+++ b/python/paddle/distributed/fleet/base/topology.py
@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None
 class ParallelMode(object):
    """
    There are all the parallel modes currently supported:
-    - DATA_PARALLEL: Distribute input data to different devices.
-    - TENSOR_PARALLEL: Shards tensors in the network to different devices.
+        - DATA_PARALLEL: Distribute input data to different devices.
-    - PIPELINE_PARALLEL: Place different layers of the network on different devices.
+        - TENSOR_PARALLEL: Shards tensors in the network to different devices.
-    - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states 
+        - PIPELINE_PARALLEL: Place different layers of the network on different devices.
-                         corresponding to the parameters to each device.
+        - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.
    Examples:
        .. code-block:: python
@@ -43,6 +44,7 @@ class ParallelMode(object):
            print(parallel_mode.DATA_PARALLEL)  # 0
    """
    DATA_PARALLEL = 0
    TENSOR_PARALLEL = 1
    PIPELINE_PARALLEL = 2
@@ -50,14 +52,16 @@ class ParallelMode(object):
 class CommunicateTopology(object):
+    def __init__(
-    def __init__(self,
+        self,
-                 hybrid_group_names=["data", "pipe", "sharding", "model"],
+        hybrid_group_names=["data", "pipe", "sharding", "model"],
-                 dims=[1, 1, 1, 1]):
+        dims=[1, 1, 1, 1],
+    ):
        self._parallel_names = hybrid_group_names
        self._dims = dims
-        self.coordinate = collections.namedtuple('Coordinate',
+        self.coordinate = collections.namedtuple(
-                                                 self._parallel_names)
+            'Coordinate', self._parallel_names
+        )
        self._world_size = reduce(lambda x, y: x * y, self._dims)
        ranges = [range(d) for d in self._dims]
@@ -65,7 +69,8 @@ class CommunicateTopology(object):
        self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
        self._rank2coord = dict(
-            zip(self._coord2rank.values(), self._coord2rank.keys()))
+            zip(self._coord2rank.values(), self._coord2rank.keys())
+        )
    def get_hybrid_group_names(self):
        return self._parallel_names
@@ -90,7 +95,8 @@ class CommunicateTopology(object):
    def get_axis_list(self, axis_name, index):
        axis = self._parallel_names.index(axis_name)
        ranks = [
-            self._coord2rank[coord] for coord in self._coord2rank.keys()
+            self._coord2rank[coord]
+            for coord in self._coord2rank.keys()
            if coord[axis] == index
        ]
        ranks.sort()
@@ -132,7 +138,6 @@ class CommunicateTopology(object):
 class HybridCommunicateGroup(object):
    def __init__(self, topology):
        self.nranks = paddle.distributed.get_world_size()
        self.global_rank = paddle.distributed.get_rank()
@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object):
        self._sharding_parallel_id = self._get_sharding_parallel_id()
        self.stage_id = self._get_pipe_parallel_id()
-        assert self._check_vaild_topo(
+        assert self._check_vaild_topo(), (
-        ), "Here is an unreasonable topogy setting. world_size: {}, but" \
+            "Here is an unreasonable topogy setting. world_size: {}, but"
-            "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(self.nranks,
+            "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(
-            self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree)
+                self.nranks,
+                self._mp_degree,
+                self._sharding_degree,
+                self._pp_degree,
+                self._dp_degree,
+            )
+        )
        # create comm group for data parallel
        self._dp_group, self._dp_comm_group = self._set_comm_group("data")
@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object):
        # create comm group for sharding parallel
        self._sharding_group, self._sharding_comm_group = self._set_comm_group(
-            "sharding")
+            "sharding"
+        )
        # create global group for check inf_nan / clip global norm
        self._check_group, self._check_comm_group = self._set_check_group(
-            "data")
+            "data"
+        )
        # create p2p group
-        self.is_first_stage = (self.stage_id == 0)
+        self.is_first_stage = self.stage_id == 0
-        self.is_last_stage = (self.stage_id == (self._pp_degree - 1))
+        self.is_last_stage = self.stage_id == (self._pp_degree - 1)
        # create p2p_groups
        if self._pp_degree > 1:
            self._set_p2p_group()
-        debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \
+        debug_str = (
-                    "sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree,
+            "HybridParallelInfo: rank_id: %d, mp_degree: %d, "
-                    self._sharding_degree, self._pp_degree, self._dp_degree)
+            "sharding_degree: %d, pp_degree: %d, dp_degree: %d"
-        debug_str += ", mp_group: %s,  sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" % (
+            % (
-            self._mp_group, self._sharding_group, self._pp_group,
+                self.global_rank,
-            self._dp_group, self._check_group)
+                self._mp_degree,
+                self._sharding_degree,
+                self._pp_degree,
+                self._dp_degree,
+            )
+        )
+        debug_str += (
+            ", mp_group: %s,  sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s"
+            % (
+                self._mp_group,
+                self._sharding_group,
+                self._pp_group,
+                self._dp_group,
+                self._check_group,
+            )
+        )
        logger.info(debug_str)
        global _HYBRID_PARALLEL_GROUP
@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object):
        # adding its parallel logic within that parallelism
        # when use sharding alone, it should have its own parallelism for its parallel logic
        # TODO modify 3 others parallel to support sharding
-        if self._mp_degree == 1 and self._pp_degree == 1 and self._dp_degree == 1 and self._sharding_degree > 1:
+        if (
+            self._mp_degree == 1
+            and self._pp_degree == 1
+            and self._dp_degree == 1
+            and self._sharding_degree > 1
+        ):
            return ParallelMode.SHARDING_PARALLEL
        elif self._mp_degree == 1 and self._pp_degree == 1:
            return ParallelMode.DATA_PARALLEL
@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object):
            return ParallelMode.PIPELINE_PARALLEL
    def _check_vaild_topo(self):
-        return self._dp_degree * self._mp_degree * self._pp_degree * self._sharding_degree == self.nranks
+        return (
+            self._dp_degree
+            * self._mp_degree
+            * self._pp_degree
+            * self._sharding_degree
+            == self.nranks
+        )
    def _set_comm_group(self, parallel_method="data"):
        parallel_group = []
@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object):
                    self.prev_rank = prev_rank
                next_group = paddle.distributed.new_group(
-                    ranks=[curr_rank, next_rank])
+                    ranks=[curr_rank, next_rank]
+                )
                if self.global_rank == curr_rank:
                    self.send_next_group = next_group
                elif self.global_rank == next_rank:
                    self.recv_prev_group = next_group
                prev_group = paddle.distributed.new_group(
-                    ranks=[prev_rank, curr_rank])
+                    ranks=[prev_rank, curr_rank]
+                )
                if self.global_rank == curr_rank:
                    self.send_prev_group = prev_group
@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object):
        return self._pp_comm_group
    def get_p2p_groups(self):
-        return self.send_next_group, self.send_prev_group, self.recv_next_group, self.recv_prev_group
+        return (
+            self.send_next_group,
+            self.send_prev_group,
+            self.recv_next_group,
+            self.recv_prev_group,
+        )
    # sharding parallel message:
    def _get_sharding_parallel_id(self):
@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object):
        return self._check_comm_group
    def get_rank_from_stage(self, stage_id, **kwargs):
-        return self._topo.get_rank_from_stage(self.global_rank,
+        return self._topo.get_rank_from_stage(
-                                              pipe=stage_id,
+            self.global_rank, pipe=stage_id, **kwargs
-                                              **kwargs)
+        )
 class _CommunicateGroup(object):
-    """ tmp for static """
+    """tmp for static"""
    def __init__(self):
        global _HYBRID_PARALLEL_GROUP
        _HYBRID_PARALLEL_GROUP = self
        self.groups = dict()
-    def set_comm_group(self, group_name, group_rank, group_size, ring_id,
+    def set_comm_group(
-                       group_ranks):
+        self, group_name, group_rank, group_size, ring_id, group_ranks
-        group = paddle.distributed.collective.Group(group_rank, ring_id,
+    ):
-                                                    group_ranks)
+        group = paddle.distributed.collective.Group(
+            group_rank, ring_id, group_ranks
+        )
        self.groups[group_name] = group
    def get_group(self, group_name):

--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -103,6 +103,7 @@ def _check_var_exists(var_name):
 def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.
    Note:
@@ -118,6 +119,7 @@ def init_parallel_env():
    Examples:
        .. code-block:: python
            # required: gpu
            import paddle
            import paddle.nn as nn
@@ -158,6 +160,7 @@ def init_parallel_env():
            if __name__ == '__main__':
                dist.spawn(train)
    """
    # 0. get env & check world size

--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
--- a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
+++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
@@ -23,9 +23,9 @@ from ...log_helper import get_logger
 __all__ = ['add_supported_layer']
-_logger = get_logger(__name__,
+_logger = get_logger(
-                     logging.INFO,
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
-                     fmt='%(asctime)s-%(levelname)s: %(message)s')
+)
 def _default_pruning(weight_nparray, m, n, func_name, param_name):
@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
    exlude_cond_shape4 = len(shape) == 4 and shape[1] < m
    if exlude_cond_shape2:
        _logger.warning(
-            '{} is not pruned because the first dimension of {} is smaller than {}'
+            '{} is not pruned because the first dimension of {} is smaller than {}'.format(
-            .format(param_name, shape, m))
+                param_name, shape, m
+            )
+        )
        return weight_pruned_nparray, weight_sparse_mask
    if exlude_cond_shape4:
        _logger.warning(
-            '{} is not pruned because the second dimension of {} is smaller than {}'
+            '{} is not pruned because the second dimension of {} is smaller than {}'.format(
-            .format(param_name, shape, m))
+                param_name, shape, m
+            )
+        )
        return weight_pruned_nparray, weight_sparse_mask
    checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
    # sparsity/utils is row-major pruning. That is the reason we have to transpose weight
    # matrices beforce invoking create_mask. Then we transpose the result mask to make
    # sure its shape to be the same as the input weight.
-    weight_sparse_mask = sparsity.create_mask(weight_nparray.T,
+    weight_sparse_mask = sparsity.create_mask(
-                                              func_name=func_name,
+        weight_nparray.T, func_name=func_name, n=n, m=m
-                                              n=n,
+    ).T
-                                              m=m).T
    weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
-    assert sparsity.check_sparsity(weight_pruned_nparray.T,  n=n, m=m, func_name=checked_func_name), \
+    assert sparsity.check_sparsity(
-                    'Pruning {} weight matrix failure!!!'.format(param_name)
+        weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name
+    ), 'Pruning {} weight matrix failure!!!'.format(param_name)
    return weight_pruned_nparray, weight_sparse_mask
@@ -78,28 +82,35 @@ supported_layers_and_prune_func_map = {}
 def add_supported_layer(layer, pruning_func=None):
    r"""
    Add supported layers and its corresponding pruning function.
    Args:
-        name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then 
+        layer (string|Layer): The name or type of the layer to support. If layer is a `Layer`, then
-        it would be turn to string internally. ASP would use this name to match parameter's name and call 
+                             it would be converted to a string internally. ASP would use this name to match the parameter's name and call
-        its the corresponding pruning function.
+                             its corresponding pruning function.
        pruning_func (function, optional): a function type which receives five argument (weight_nparray,
-        m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
+                                           m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
-        m, n, and func_name, please see `prune_model` for details.
+                                           m, n, and func_name, please see `prune_model` for details.
    """
    name = None
    if isinstance(layer, str):
        name = layer
    elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
        name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
-            type(layer).__name__)
+            type(layer).__name__
+        )
    elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
        name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
-            layer.__name__)
+            layer.__name__
+        )
    else:
-        assert "The type of layer should be string of Layer, but got {}!".format(
+        assert (
-            type(layer))
+            "The type of layer should be string of Layer, but got {}!".format(
+                type(layer)
+            )
+        )
    if pruning_func is None:
        pruning_func = _default_pruning
    _supported_layers_and_prune_func_map_lock.acquire()

--- a/python/paddle/fluid/contrib/sparsity/utils.py
+++ b/python/paddle/fluid/contrib/sparsity/utils.py
@@ -27,9 +27,16 @@ from itertools import permutations
 import threading
 __all__ = [
-    'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d',
+    'calculate_density',
-    'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity',
+    'check_mask_1d',
-    'MaskAlgo', 'CheckMethod'
+    'get_mask_1d',
+    'check_mask_2d',
+    'get_mask_2d_greedy',
+    'get_mask_2d_best',
+    'create_mask',
+    'check_sparsity',
+    'MaskAlgo',
+    'CheckMethod',
 ]
@@ -76,8 +83,9 @@ class CheckMethod(Enum):
            CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
            # CheckMethod.CHECK_2D
        """
-        assert isinstance(mask_algo, MaskAlgo), \
+        assert isinstance(
-               "mask_algo should be MaskAlgo type"
+            mask_algo, MaskAlgo
+        ), "mask_algo should be MaskAlgo type"
        if mask_algo == MaskAlgo.MASK_1D:
            return CheckMethod.CHECK_1D
        else:
@@ -86,20 +94,25 @@ class CheckMethod(Enum):
 def calculate_density(x):
    r"""
    Return the density of the input tensor.
    Args:
        x (nparray): The input tensor.
    Returns:
-        float: The density of :attr:`x`.
+        float, The density of :attr:`x`.
    Examples:
        .. code-block:: python
-          import paddle
-          import numpy as np
-          x = np.array([[0, 1, 3, 0],
+            import paddle
+            import numpy as np
+            x = np.array([[0, 1, 3, 0],
                        [1, 1, 0, 1]])
-          paddle.incubate.asp.calculate_density(x) # 0.625
+            paddle.incubate.asp.calculate_density(x) # 0.625
    """
    x_flattened = x.flatten()
    return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
@@ -108,7 +121,7 @@ def calculate_density(x):
 def _reshape_1d(mat, m):
    r"""
    Reshape the input 2D matrix to shape (-1, m).
-    If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, 
+    If the second dimension of :attr:`mat` is not a multiple of :attr:`m`,
    then this function would pad the remainder with 0 before reshaping.
    .. math::
@@ -126,7 +139,7 @@ def _reshape_1d(mat, m):
    remainder = mat.shape[1] % m
    if mat.shape[1] % m > 0:
        mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
-        mat_padded[:, :mat.shape[1]] = mat
+        mat_padded[:, : mat.shape[1]] = mat
        shape = mat_padded.shape
        return mat_padded.reshape(-1, m), shape
    else:
@@ -136,7 +149,7 @@ def _reshape_1d(mat, m):
 def check_mask_1d(mat, n, m):
    r"""
    Check if every row of the input matrix :attr:`mat` is in 1D `n:m` sparse pattern.
-    This function would pad the second dimension of :attr:`mat` by zero 
+    This function would pad the second dimension of :attr:`mat` by zero
    to be a multiples of :attr:`m` if necessary.
    1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block.
@@ -179,8 +192,8 @@ def check_mask_1d(mat, n, m):
 def get_mask_1d(mat, n, m):
    r"""
-    Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` 
+    Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat`
-    in row-directory. This function would pad the second dimension of :attr:`mat` 
+    in row-directory. This function would pad the second dimension of :attr:`mat`
    by zero to be a multiples of :attr:`m` before mask generation.
    1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block.
@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m):
        min_order_indices = np.argsort(np.absolute(sub_mat))
        mask_flattern[i, min_order_indices[:n].tolist()] = 0
    mask_flattern = mask_flattern.reshape(shape)
-    mask[:, :] = mask_flattern[:, :mat.shape[1]]
+    mask[:, :] = mask_flattern[:, : mat.shape[1]]
    return mask
@@ -239,12 +252,12 @@ def _reshape_2d(mat, m):
    remainder_0 = mat.shape[0] % m
    remainder_1 = mat.shape[1] % m
-    new_shape = (mat.shape[0] if remainder_0 == 0 \
+    new_shape = (
-                 else mat.shape[0] + (m - remainder_0),
+        mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0),
-                 mat.shape[1] if remainder_1 == 0 \
+        mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1),
-                 else mat.shape[1] + (m - remainder_1))
+    )
    mat_padded = np.zeros(new_shape)
-    mat_padded[:mat.shape[0], :mat.shape[1]] = mat
+    mat_padded[: mat.shape[0], : mat.shape[1]] = mat
    mat_flattern = np.empty(new_shape).reshape(-1, m * m)
    curr_idx = 0
@@ -252,9 +265,9 @@ def _reshape_2d(mat, m):
        row_end = row_start + m
        for col_start in range(0, mat_padded.shape[1], m):
            col_end = col_start + m
-            sub_mat = np.squeeze(mat_padded[row_start:row_end, \
+            sub_mat = np.squeeze(
-                                            col_start:col_end] \
+                mat_padded[row_start:row_end, col_start:col_end].reshape(-1)
-                                            .reshape(-1))
+            )
            mat_flattern[curr_idx] = sub_mat
            curr_idx += 1
    return mat_flattern, mat_padded.shape
@@ -263,10 +276,10 @@ def _reshape_2d(mat, m):
 def check_mask_2d(mat, n, m):
    r"""
    Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in 2D `n:m` sparse pattern.
-    This function would pad each dimension of :attr:`mat` by zero to be a multiples of 
+    This function would pad each dimension of :attr:`mat` by zero to be a multiple of
    :attr:`m` if necessary.
-    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block 
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
    under the constraint of at least :attr:`n` zeros for each row and column.
    Args:
@@ -304,18 +317,19 @@ def check_mask_2d(mat, n, m):
    mat_padded, shape = _reshape_2d(mat, m)
    for sub_mat in mat_padded:
        sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
-        if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \
+        if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and (
-            (np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0):
+            np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0
+        ):
            return False
    return True
 def get_mask_2d_greedy(mat, n, m):
    r"""
-    Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. 
+    Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`.
    This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation.
-    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block 
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
    under the constraint of at least :attr:`n` zeros for each row and column.
    Greedily generating: For each :math:`m \times m` block, selecting values to keep in descent order.
@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m):
        sub_mask = np.squeeze(mask_padded[idx])
        min_order_1d_indices = np.argsort(sub_mat)
-        min_order_2d_indices = [(int(x / m), x % m)
+        min_order_2d_indices = [
-                                for x in min_order_1d_indices]
+            (int(x / m), x % m) for x in min_order_1d_indices
+        ]
        row_counter = collections.Counter()
        col_counter = collections.Counter()
        for i in range(len(min_order_1d_indices) - 1, -1, -1):
            matrix_entry = min_order_2d_indices[i]
-            if (row_counter[matrix_entry[0]] == n) or \
+            if (row_counter[matrix_entry[0]] == n) or (
-               (col_counter[matrix_entry[1]] == n):
+                col_counter[matrix_entry[1]] == n
+            ):
                continue
            sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m):
            col_end = col_start + m
            mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
            curr_idx += 1
-    return mask[:mat.shape[0], :mat.shape[1]]
+    return mask[: mat.shape[0], : mat.shape[1]]
 _valid_2d_patterns_lock = threading.Lock()
@@ -384,7 +400,7 @@ def _compute_valid_2d_patterns(n, m):
    r"""
    Compute all vaild 2D `n:m` sparse patterns.
-    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block 
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
    under the constraint of at least :attr:`n` zeros for each row and column.
    Args:
@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m):
        patterns = patterns + patterns
        patterns = np.asarray(list(set(permutations(patterns, m))))
-        valid = ((patterns.sum(axis=1) <= n).sum(
+        valid = (
-            axis=1) == m).nonzero()[0].reshape(-1)
+            ((patterns.sum(axis=1) <= n).sum(axis=1) == m)
+            .nonzero()[0]
+            .reshape(-1)
+        )
        valid_patterns = np.empty((valid.shape[0], m, m))
        valid_patterns[:] = patterns[valid[:]]
@@ -420,11 +439,11 @@ def _compute_valid_2d_patterns(n, m):
 def get_mask_2d_best(mat, n, m):
    r"""
-    Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` 
+    Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`
-    to form sparse matrix with maximun L1 norm .This function would pad each 
+    to form sparse matrix with maximum L1 norm. This function would pad each
    dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation.
-    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block 
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
    under the constraint of at least :attr:`n` zeros for each row and column.
    *Note*: L1 norm of sparse matrix from `Best` API is greater than or equal to the one from `Greedy`.
@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m):
    mat_flattern, shape = _reshape_2d(mat, m)
    mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
-    pmax = np.argmax(np.matmul(mat_flattern,
+    pmax = np.argmax(
-                               patterns.reshape(patterns.shape[0], m * m).T),
+        np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
-                     axis=1)
+        axis=1,
+    )
    mask_flattern[:] = patterns[pmax[:]]
    mask = np.empty(shape)
@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m):
            col_end = col_start + m
            mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
            curr_idx += 1
-    return mask[:mat.shape[0], :mat.shape[1]]
+    return mask[: mat.shape[0], : mat.shape[1]]
 def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
    dtype = tensor.dtype
    t = tensor.astype(float)
-    assert isinstance(func_name, MaskAlgo), \
+    assert isinstance(func_name, MaskAlgo), (
-           "func_name argumet of create_mask is only accepted as type MaskAlgo. " \
+        "func_name argumet of create_mask is only accepted as type MaskAlgo. "
-           "But got {}".format(type(func_name))
+        "But got {}".format(type(func_name))
+    )
    func = getattr(sys.modules[__name__], func_name.value, None)
    if len(shape) == 1:
        t = t.reshape(1, shape[0])
@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
        t = t.reshape(shape[0] * shape[1], shape[2])
    # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
    elif len(shape) == 4:
-        t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3],
+        t = t.transpose([0, 1, 3, 2]).reshape(
-                                              shape[2])
+            shape[0] * shape[1] * shape[3], shape[2]
+        )
        mask = func(t, n=n, m=m)
-        return mask.reshape([shape[0], shape[1], shape[3],
+        return (
-                             shape[2]]).transpose([0, 1, 3, 2]).astype(dtype)
+            mask.reshape([shape[0], shape[1], shape[3], shape[2]])
+            .transpose([0, 1, 3, 2])
+            .astype(dtype)
+        )
    else:
-        raise ValueError("The dimension of input tensor is not supported in create_mask, " \
+        raise ValueError(
-                         "Only dimension < 4 is supported but got {}".format(len(shape)))
+            "The dimension of input tensor is not supported in create_mask, "
+            "Only dimension < 4 is supported but got {}".format(len(shape))
+        )
    mask = func(t, n=n, m=m)
    return mask.reshape(shape).astype(dtype)
@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
    shape = tensor.shape
    t = tensor.astype(float)
-    assert type(func_name) == CheckMethod, \
+    assert type(func_name) == CheckMethod, (
-           "func_name argumet of check_sparsity is only accepted as type CheckMethod. " \
+        "func_name argumet of check_sparsity is only accepted as type CheckMethod. "
-           "But got {}".format(type(func_name))
+        "But got {}".format(type(func_name))
+    )
    func = getattr(sys.modules[__name__], func_name.value, None)
    if len(shape) == 1:
        t = t.reshape(1, shape[0])
@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
        t = t.reshape(shape[0] * shape[1], shape[2])
    # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
    elif len(shape) == 4:
-        t = t.transpose([0, 1, 3,
+        t = t.transpose([0, 1, 3, 2]).reshape(
-                         2]).reshape([shape[0] * shape[1] * shape[3], shape[2]])
+            [shape[0] * shape[1] * shape[3], shape[2]]
+        )
    else:
-        raise ValueError("The dimension of input tensor is not supported in create_mask, " \
+        raise ValueError(
-                         "Only dimension < 4 is supported but got {}".format(len(shape)))
+            "The dimension of input tensor is not supported in create_mask, "
+            "Only dimension < 4 is supported but got {}".format(len(shape))
+        )
    return func(t, n=n, m=m)
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass):
 @six.add_metaclass(VariableMetaClass)
 class Variable(object):
    """
-    **Notes**:
-        **The constructor of Variable should not be invoked directly.**
-        **In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.**
+    Notes:
+        The constructor of Variable should not be invoked directly.
+        In Static Graph Mode: Please use `Block.create_var` to create a Static variable which has no data until being fed.
-        **In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data**
+        In Dygraph Mode: Please use :ref:`api_fluid_dygraph_to_variable` to create a dygraph variable with real data.
    In Fluid, every input and output of an OP is a variable. In most
    cases, variables are used for holding different kinds of data or training
@@ -1514,12 +1515,13 @@ class Variable(object):
    def detach(self):
        """
        Returns a new Variable, detached from the current graph.
        It will share data with origin Variable and without tensor copy.
        In addition, the detached Variable doesn't provide gradient propagation.
        Returns:
-             ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable.
+             ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable.
        Examples:
            .. code-block:: python
@@ -1533,6 +1535,7 @@ class Variable(object):
                # create a detached Variable
                y = x.detach()
        """
        assert (
@@ -2085,6 +2088,7 @@ class Variable(object):
    @property
    def T(self):
        """
        Permute current Variable with its dimensions reversed.
        If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`.
@@ -2103,6 +2107,7 @@ class Variable(object):
                x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0]
                print(x_T_np.shape)
                # (5, 3, 2)
        """
        if len(self.shape) == 1:
            return self
@@ -2141,7 +2146,7 @@ class Variable(object):
        as ``out = assign(tensor)`` .
        Returns:
-            Variable: The cloned Variable.
+            Variable, The cloned Variable.
        Examples:
            .. code-block:: python
@@ -2171,6 +2176,7 @@ class Variable(object):
    def _set_error_clip(self, error_clip):
        """
        Set the error_clip.
        Args:
@@ -2178,11 +2184,13 @@ class Variable(object):
        Returns:
            None
        """
        self.error_clip = error_clip
    def _set_info(self, key, value):
        """
        Set key-value information for this variable.
        Args:
@@ -2191,6 +2199,7 @@ class Variable(object):
        Returns:
            None
        """
        if not hasattr(self, "_info"):
            self._info = {}
@@ -2198,6 +2207,7 @@ class Variable(object):
    def _get_info(self, key):
        """
        Get the information of this variable corresponding to key.
        Args:
@@ -2205,6 +2215,7 @@ class Variable(object):
        Returns:
            object
        """
        if hasattr(self, "_info") and key in self._info:
            return self._info[key]
@@ -2212,7 +2223,9 @@ class Variable(object):
    def _slice_indices(self, slice, length):
        """
        Reference implementation for the slice.indices method.
        """
        # Compute step and length as integers.
        step = 1 if slice.step is None else slice.step
@@ -2383,7 +2396,7 @@ class Variable(object):
                Default: None
        Returns:
-            Tensor: the value in given scope.
+            Tensor, the value in given scope.
        Examples:
            .. code-block:: python
@@ -2438,6 +2451,7 @@ class Variable(object):
    def set_value(self, value, scope=None):
        '''
        Set the value to the tensor in given scope.
        Args:
@@ -2477,6 +2491,7 @@ class Variable(object):
                    if var.persistable:
                        t_load = paddle.load(path+var.name+'.pdtensor')
                        var.set_value(t_load)
        '''
        # The 'framework' is a low-level module, and 'executor'
@@ -2547,10 +2562,11 @@ class Variable(object):
    def size(self):
        """
        Returns the number of elements for current Variable, which is an int64 Variable with shape [1].
        Returns:
-            Variable: the number of elements for current Variable
+            Variable, the number of elements for current Variable
        Examples:
            .. code-block:: python
@@ -2564,6 +2580,7 @@ class Variable(object):
                # get the number of elements of the Variable
                y = x.size()
        """
        output = self.block.create_var(
@@ -2578,23 +2595,27 @@ class Variable(object):
    def _set_attr(self, name, val):
        """
        Set the value of attribute by attribute's name.
        Args:
            name(str): the attribute name.
            val(int|str|list): the value of the attribute.
        """
        self._update_desc_attr(name, val)
    def _has_attr(self, name):
        """
        Whether this Variable has the attribute with the name `name` or not.
        Args:
            name(str): the attribute name.
        Returns:
-            bool: True if has this attribute.
+            bool, True if has this attribute.
        """
        return self.desc.has_attr(name)
@@ -2624,7 +2645,7 @@ class Variable(object):
            name(str): the attribute name.
        Returns:
-            int|str|list: The attribute value. The return value
+            int|str|list, The attribute value. The return value
            can be any valid attribute type.
        """
        return self.desc.attr(name)
@@ -3196,14 +3217,16 @@ class Operator(object):
    def input(self, name):
        r"""
        Get the input arguments according to the input parameter name.
        Args:
            name(str): The input parameter name.
        Returns:
-            list: return the list of argument names that associated with \
+            list, return the list of argument names that are associated with \
                the specific parameter name.
        """
        return self.desc.input(name)

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -20,7 +20,13 @@ from __future__ import print_function
 import warnings
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
-from ..framework import Variable, _non_static_mode, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode
+from ..framework import (
+    Variable,
+    _non_static_mode,
+    _varbase_creator,
+    _in_legacy_dygraph,
+    in_dygraph_mode,
+)
 from .. import core
 from ..param_attr import ParamAttr
 from . import nn
@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc']
 def accuracy(input, label, k=1, correct=None, total=None):
    """
    accuracy layer.
    Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
    This function computes the accuracy using the input and label.
    If the correct label occurs in top k predictions, then correct will increment by one.
-    Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
+    Note:
+        The dtype of accuracy is determined by input. The input and label dtype can be different.
    Args:
        input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64.
            The shape is ``[sample_number, class_dim]`` .
        label(Tensor): The label of dataset.  Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
-        k(int): The top k predictions for each class will be checked. Data type is int64 or int32.
+        k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1.
-        correct(Tensor): The correct predictions count. A Tensor with type int64 or int32.
+        correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None.
-        total(Tensor): The total entries count. A tensor with type int64 or int32.
+        total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None.
    Returns:
-        Tensor: The correct rate. A Tensor with type float32.
+        Tensor, The correct rate. A Tensor with type float32.
    Examples:
        .. code-block:: python
            import numpy as np
            import paddle
            import paddle.static as static
@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
                        fetch_list=[result[0]])
            print(output)
            #[array([0.], dtype=float32)]
    """
    if _non_static_mode():
        if correct is None:
@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None):
            total = _varbase_creator(dtype="int32")
        _k = k.numpy().item(0) if isinstance(k, Variable) else k
-        topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k,
+        topk_out, topk_indices = _legacy_C_ops.top_k_v2(
-                                                        'sorted', False)
+            input, 'k', _k, 'sorted', False
-        _acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label,
+        )
-                                            correct, total)
+        _acc, _, _ = _legacy_C_ops.accuracy(
+            topk_out, topk_indices, label, correct, total
+        )
        return _acc
    helper = LayerHelper("accuracy", **locals())
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
+    check_variable_and_dtype(
-                             'accuracy')
+        input, 'input', ['float16', 'float32', 'float64'], 'accuracy'
+    )
    topk_out = helper.create_variable_for_type_inference(dtype=input.dtype)
    topk_indices = helper.create_variable_for_type_inference(dtype="int64")
    inputs = {"X": [input]}
@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None):
    else:
        attrs = {'k': k}
    attrs['sorted'] = False
-    helper.append_op(type="top_k_v2",
+    helper.append_op(
-                     inputs=inputs,
+        type="top_k_v2",
-                     attrs=attrs,
+        inputs=inputs,
-                     outputs={
+        attrs=attrs,
-                         "Out": [topk_out],
+        outputs={"Out": [topk_out], "Indices": [topk_indices]},
-                         "Indices": [topk_indices]
+    )
-                     })
    acc_out = helper.create_variable_for_type_inference(dtype="float32")
    if correct is None:
        correct = helper.create_variable_for_type_inference(dtype="int32")
    if total is None:
        total = helper.create_variable_for_type_inference(dtype="int32")
-    helper.append_op(type="accuracy",
+    helper.append_op(
-                     inputs={
+        type="accuracy",
-                         "Out": [topk_out],
+        inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]},
-                         "Indices": [topk_indices],
+        outputs={
-                         "Label": [label]
+            "Accuracy": [acc_out],
-                     },
+            "Correct": [correct],
-                     outputs={
+            "Total": [total],
-                         "Accuracy": [acc_out],
+        },
-                         "Correct": [correct],
+    )
-                         "Total": [total],
-                     })
    return acc_out
-def auc(input,
+def auc(
-        label,
+    input,
-        curve='ROC',
+    label,
-        num_thresholds=2**12 - 1,
+    curve='ROC',
-        topk=1,
+    num_thresholds=2**12 - 1,
-        slide_steps=1,
+    topk=1,
-        ins_tag_weight=None):
+    slide_steps=1,
+    ins_tag_weight=None,
+):
    """
    **Area Under the Curve (AUC) Layer**
@@ -216,13 +232,14 @@ def auc(input,
    helper = LayerHelper("auc", **locals())
    if ins_tag_weight is None:
-        ins_tag_weight = tensor.fill_constant(shape=[1, 1],
+        ins_tag_weight = tensor.fill_constant(
-                                              dtype="float32",
+            shape=[1, 1], dtype="float32", value=1.0
-                                              value=1.0)
+        )
    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc')
    check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc')
-    check_variable_and_dtype(ins_tag_weight, 'ins_tag_weight',
+    check_variable_and_dtype(
-                             ['float32', 'float64'], 'auc')
+        ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc'
+    )
    auc_out = helper.create_variable_for_type_inference(dtype="float64")
    batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
@@ -236,62 +253,71 @@ def auc(input,
    batch_stat_pos = helper.create_global_variable(
        persistable=True,
        dtype='int64',
-        shape=[(1 + slide_steps) * (num_thresholds + 1) + 1])
+        shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
+    )
    batch_stat_neg = helper.create_global_variable(
        persistable=True,
        dtype='int64',
-        shape=[(1 + slide_steps) * (num_thresholds + 1) + 1])
+        shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
+    )
    # for global auc
    # Needn't maintain the batch id
-    stat_pos = helper.create_global_variable(persistable=True,
+    stat_pos = helper.create_global_variable(
-                                             dtype='int64',
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
-                                             shape=[1, num_thresholds + 1])
+    )
-    stat_neg = helper.create_global_variable(persistable=True,
+    stat_neg = helper.create_global_variable(
-                                             dtype='int64',
+        persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
-                                             shape=[1, num_thresholds + 1])
+    )
    for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
-        helper.set_variable_initializer(var, Constant(value=0.0,
+        helper.set_variable_initializer(
-                                                      force_cpu=False))
+            var, Constant(value=0.0, force_cpu=False)
+        )
-    #"InsTagWeight": [ins_tag_weight]
+    # "InsTagWeight": [ins_tag_weight]
    # Batch AUC
-    helper.append_op(type="auc",
+    helper.append_op(
-                     inputs={
+        type="auc",
-                         "Predict": [input],
+        inputs={
-                         "Label": [label],
+            "Predict": [input],
-                         "StatPos": [batch_stat_pos],
+            "Label": [label],
-                         "StatNeg": [batch_stat_neg]
+            "StatPos": [batch_stat_pos],
-                     },
+            "StatNeg": [batch_stat_neg],
-                     attrs={
+        },
-                         "curve": curve,
+        attrs={
-                         "num_thresholds": num_thresholds,
+            "curve": curve,
-                         "slide_steps": slide_steps
+            "num_thresholds": num_thresholds,
-                     },
+            "slide_steps": slide_steps,
-                     outputs={
+        },
-                         "AUC": [batch_auc_out],
+        outputs={
-                         "StatPosOut": [batch_stat_pos],
+            "AUC": [batch_auc_out],
-                         "StatNegOut": [batch_stat_neg]
+            "StatPosOut": [batch_stat_pos],
-                     })
+            "StatNegOut": [batch_stat_neg],
+        },
+    )
    # Global AUC
-    helper.append_op(type="auc",
+    helper.append_op(
-                     inputs={
+        type="auc",
-                         "Predict": [input],
+        inputs={
-                         "Label": [label],
+            "Predict": [input],
-                         "StatPos": [stat_pos],
+            "Label": [label],
-                         "StatNeg": [stat_neg]
+            "StatPos": [stat_pos],
-                     },
+            "StatNeg": [stat_neg],
-                     attrs={
+        },
-                         "curve": curve,
+        attrs={
-                         "num_thresholds": num_thresholds,
+            "curve": curve,
-                         "slide_steps": 0
+            "num_thresholds": num_thresholds,
-                     },
+            "slide_steps": 0,
-                     outputs={
+        },
-                         "AUC": [auc_out],
+        outputs={
-                         "StatPosOut": [stat_pos],
+            "AUC": [auc_out],
-                         "StatNegOut": [stat_neg]
+            "StatPosOut": [stat_pos],
-                     })
+            "StatNegOut": [stat_neg],
-    return auc_out, batch_auc_out, [
+        },
-        batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
+    )
-    ]
+    return (
+        auc_out,
+        batch_auc_out,
+        [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg],
+    )
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/geometric/message_passing/send_recv.py
+++ b/python/paddle/geometric/message_passing/send_recv.py
@@ -241,13 +241,13 @@ def send_ue_recv(
        src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
        dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
                            The available data type is int32, int64.
-        message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
+        message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
-        reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`.
+        reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
                         Default value is `sum`.
-        out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or
+        out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
                                    out_size is smaller or equal to 0, then this input will not be used.
                                    Otherwise, `out_size` should be equal with or larger than
-                                    max(dst_index) + 1.
+                                    max(dst_index) + 1. Default value is `None`.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.

--- a/python/paddle/geometric/reindex.py
+++ b/python/paddle/geometric/reindex.py
--- a/python/paddle/geometric/sampling/neighbors.py
+++ b/python/paddle/geometric/sampling/neighbors.py
@@ -32,6 +32,7 @@ def sample_neighbors(
    name=None,
 ):
    """
    Graph Sample Neighbors API.
    This API is mainly used in Graph Learning domain, and the main purpose is to
@@ -52,16 +53,16 @@ def sample_neighbors(
                         The data type should be the same with `row`.
        input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
                              data type should be the same with `row`.
-        sample_size (int): The number of neighbors we need to sample. Default value is -1,
+        sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
                           which means returning all the neighbors of the input nodes.
-        eids (Tensor): The eid information of the input graph. If return_eids is True,
+        eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
                            then `eids` should not be None. The data type should be the
                            same with `row`. Default is None.
-        return_eids (bool): Whether to return eid information of sample edges. Default is False.
+        return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
-        perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
+        perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
                              is True, then `perm_buffer` should not be None. The data type should
                              be the same with `row`. If not None, we will use fisher-yates sampling
-                              to speed up. Only useful for gpu version.
+                              to speed up. Only useful for gpu version. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.
@@ -69,15 +70,16 @@ def sample_neighbors(
        - out_neighbors (Tensor), the sample neighbors of the input nodes.
        - out_count (Tensor), the number of sampling neighbors of each input node, and the shape
-                              should be the same with `input_nodes`.
+          should be the same with `input_nodes`.
        - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the
-                             sample edges.
+          sample edges.
    Examples:
        .. code-block:: python
            import paddle
            # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
            #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
            row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]

--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
--- a/python/paddle/incubate/operators/graph_khop_sampler.py
+++ b/python/paddle/incubate/operators/graph_khop_sampler.py
--- a/python/paddle/incubate/operators/graph_reindex.py
+++ b/python/paddle/incubate/operators/graph_reindex.py
--- a/python/paddle/incubate/operators/graph_sample_neighbors.py
+++ b/python/paddle/incubate/operators/graph_sample_neighbors.py
--- a/python/paddle/incubate/xpu/resnet_block.py
+++ b/python/paddle/incubate/xpu/resnet_block.py
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
--- a/python/paddle/nn/functional/distance.py
+++ b/python/paddle/nn/functional/distance.py
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
--- a/python/paddle/nn/layer/distance.py
+++ b/python/paddle/nn/layer/distance.py
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
--- a/python/paddle/signal.py
+++ b/python/paddle/signal.py
--- a/python/paddle/sparse/nn/layer/activation.py
+++ b/python/paddle/sparse/nn/layer/activation.py
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None):
 def meshgrid(*args, **kwargs):
    """
-    Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids.
+    Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids.
    Args:
        *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),

--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py