Unverified commit 1490aaa9, authored by ustiniankw, committed by GitHub

[cherry-pick2.4]en-docs warning&error fix (#48332)

* fixdocs, test=document_fix

* fixdocs, test=document_fix
Parent: 3fa7a736
@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None
class ParallelMode(object):
    """
    There are all the parallel modes currently supported:

    - DATA_PARALLEL: Distribute input data to different devices.
    - TENSOR_PARALLEL: Shards tensors in the network to different devices.
    - PIPELINE_PARALLEL: Place different layers of the network on different devices.
    - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.

    Examples:
        .. code-block:: python
@@ -43,6 +44,7 @@ class ParallelMode(object):
            print(parallel_mode.DATA_PARALLEL)  # 0

    """

    DATA_PARALLEL = 0
    TENSOR_PARALLEL = 1
    PIPELINE_PARALLEL = 2
@@ -50,14 +52,16 @@ class ParallelMode(object):
class CommunicateTopology(object):
    def __init__(
        self,
        hybrid_group_names=["data", "pipe", "sharding", "model"],
        dims=[1, 1, 1, 1],
    ):
        self._parallel_names = hybrid_group_names
        self._dims = dims
        self.coordinate = collections.namedtuple(
            'Coordinate', self._parallel_names
        )
        self._world_size = reduce(lambda x, y: x * y, self._dims)

        ranges = [range(d) for d in self._dims]
@@ -65,7 +69,8 @@ class CommunicateTopology(object):
        self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
        self._rank2coord = dict(
            zip(self._coord2rank.values(), self._coord2rank.keys())
        )

    def get_hybrid_group_names(self):
        return self._parallel_names
@@ -90,7 +95,8 @@ class CommunicateTopology(object):
    def get_axis_list(self, axis_name, index):
        axis = self._parallel_names.index(axis_name)
        ranks = [
            self._coord2rank[coord]
            for coord in self._coord2rank.keys()
            if coord[axis] == index
        ]
        ranks.sort()
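# --- Illustrative sketch (not part of this commit) ----------------------------
# A minimal, self-contained model of the coordinate <-> rank mapping that
# CommunicateTopology builds above. The degree values below are assumptions
# chosen only to make the example concrete.
import collections
import itertools
from functools import reduce

names = ["data", "pipe", "sharding", "model"]
dims = [2, 2, 1, 2]  # assumed degrees; world_size = 2 * 2 * 1 * 2 = 8

Coordinate = collections.namedtuple("Coordinate", names)
coords = [Coordinate(*c) for c in itertools.product(*[range(d) for d in dims])]
coord2rank = dict(zip(coords, range(len(coords))))

assert reduce(lambda x, y: x * y, dims) == len(coords) == 8
# Equivalent of get_axis_list("data", 0): every rank whose data index is 0.
print(sorted(r for c, r in coord2rank.items() if c.data == 0))  # [0, 1, 2, 3]
# -------------------------------------------------------------------------------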
@@ -132,7 +138,6 @@ class CommunicateTopology(object):
class HybridCommunicateGroup(object):
    def __init__(self, topology):
        self.nranks = paddle.distributed.get_world_size()
        self.global_rank = paddle.distributed.get_rank()
@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object):
        self._sharding_parallel_id = self._get_sharding_parallel_id()
        self.stage_id = self._get_pipe_parallel_id()

        assert self._check_vaild_topo(), (
            "Here is an unreasonable topogy setting. world_size: {}, but"
            "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(
                self.nranks,
                self._mp_degree,
                self._sharding_degree,
                self._pp_degree,
                self._dp_degree,
            )
        )

        # create comm group for data parallel
        self._dp_group, self._dp_comm_group = self._set_comm_group("data")
@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object):
        # create comm group for sharding parallel
        self._sharding_group, self._sharding_comm_group = self._set_comm_group(
            "sharding"
        )

        # create global group for check inf_nan / clip global norm
        self._check_group, self._check_comm_group = self._set_check_group(
            "data"
        )

        # create p2p group
        self.is_first_stage = self.stage_id == 0
        self.is_last_stage = self.stage_id == (self._pp_degree - 1)

        # create p2p_groups
        if self._pp_degree > 1:
            self._set_p2p_group()

        debug_str = (
            "HybridParallelInfo: rank_id: %d, mp_degree: %d, "
            "sharding_degree: %d, pp_degree: %d, dp_degree: %d"
            % (
                self.global_rank,
                self._mp_degree,
                self._sharding_degree,
                self._pp_degree,
                self._dp_degree,
            )
        )
        debug_str += (
            ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s"
            % (
                self._mp_group,
                self._sharding_group,
                self._pp_group,
                self._dp_group,
                self._check_group,
            )
        )
        logger.info(debug_str)

        global _HYBRID_PARALLEL_GROUP
@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object):
        # adding its parallel logic within that parallelism
        # when use sharding alone, it should have its own parallelism for its parallel logic
        # TODO modify 3 others parallel to support sharding
        if (
            self._mp_degree == 1
            and self._pp_degree == 1
            and self._dp_degree == 1
            and self._sharding_degree > 1
        ):
            return ParallelMode.SHARDING_PARALLEL
        elif self._mp_degree == 1 and self._pp_degree == 1:
            return ParallelMode.DATA_PARALLEL
@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object):
            return ParallelMode.PIPELINE_PARALLEL

    def _check_vaild_topo(self):
        return (
            self._dp_degree
            * self._mp_degree
            * self._pp_degree
            * self._sharding_degree
            == self.nranks
        )
def _set_comm_group(self, parallel_method="data"): def _set_comm_group(self, parallel_method="data"):
parallel_group = [] parallel_group = []
...@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object): ...@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object):
self.prev_rank = prev_rank self.prev_rank = prev_rank
next_group = paddle.distributed.new_group( next_group = paddle.distributed.new_group(
ranks=[curr_rank, next_rank]) ranks=[curr_rank, next_rank]
)
if self.global_rank == curr_rank: if self.global_rank == curr_rank:
self.send_next_group = next_group self.send_next_group = next_group
elif self.global_rank == next_rank: elif self.global_rank == next_rank:
self.recv_prev_group = next_group self.recv_prev_group = next_group
prev_group = paddle.distributed.new_group( prev_group = paddle.distributed.new_group(
ranks=[prev_rank, curr_rank]) ranks=[prev_rank, curr_rank]
)
if self.global_rank == curr_rank: if self.global_rank == curr_rank:
self.send_prev_group = prev_group self.send_prev_group = prev_group
...@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object): ...@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object):
return self._pp_comm_group return self._pp_comm_group
def get_p2p_groups(self): def get_p2p_groups(self):
return self.send_next_group, self.send_prev_group, self.recv_next_group, self.recv_prev_group return (
self.send_next_group,
self.send_prev_group,
self.recv_next_group,
self.recv_prev_group,
)
# sharding parallel message: # sharding parallel message:
def _get_sharding_parallel_id(self): def _get_sharding_parallel_id(self):
...@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object): ...@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object):
return self._check_comm_group return self._check_comm_group
def get_rank_from_stage(self, stage_id, **kwargs): def get_rank_from_stage(self, stage_id, **kwargs):
return self._topo.get_rank_from_stage(self.global_rank, return self._topo.get_rank_from_stage(
pipe=stage_id, self.global_rank, pipe=stage_id, **kwargs
**kwargs) )
class _CommunicateGroup(object): class _CommunicateGroup(object):
""" tmp for static """ """tmp for static"""
def __init__(self): def __init__(self):
global _HYBRID_PARALLEL_GROUP global _HYBRID_PARALLEL_GROUP
_HYBRID_PARALLEL_GROUP = self _HYBRID_PARALLEL_GROUP = self
self.groups = dict() self.groups = dict()
def set_comm_group(self, group_name, group_rank, group_size, ring_id, def set_comm_group(
group_ranks): self, group_name, group_rank, group_size, ring_id, group_ranks
group = paddle.distributed.collective.Group(group_rank, ring_id, ):
group_ranks) group = paddle.distributed.collective.Group(
group_rank, ring_id, group_ranks
)
self.groups[group_name] = group self.groups[group_name] = group
def get_group(self, group_name): def get_group(self, group_name):
......
@@ -103,6 +103,7 @@ def _check_var_exists(var_name):
def init_parallel_env():
    """

    Initialize parallel training environment in dynamic graph mode.

    Note:
@@ -118,6 +119,7 @@ def init_parallel_env():
    Examples:
        .. code-block:: python

            # required: gpu
            import paddle
            import paddle.nn as nn
@@ -158,6 +160,7 @@ def init_parallel_env():
            if __name__ == '__main__':
                dist.spawn(train)

    """
    # 0. get env & check world size
...
This file's diff is collapsed.
@@ -23,9 +23,9 @@ from ...log_helper import get_logger
__all__ = ['add_supported_layer']

_logger = get_logger(
    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
)


def _default_pruning(weight_nparray, m, n, func_name, param_name):
@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
    exlude_cond_shape4 = len(shape) == 4 and shape[1] < m
    if exlude_cond_shape2:
        _logger.warning(
            '{} is not pruned because the first dimension of {} is smaller than {}'.format(
                param_name, shape, m
            )
        )
        return weight_pruned_nparray, weight_sparse_mask
    if exlude_cond_shape4:
        _logger.warning(
            '{} is not pruned because the second dimension of {} is smaller than {}'.format(
                param_name, shape, m
            )
        )
        return weight_pruned_nparray, weight_sparse_mask

    checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
    # sparsity/utils is row-major pruning. That is the reason we have to transpose weight
    # matrices before invoking create_mask. Then we transpose the result mask to make
    # sure its shape is the same as the input weight.
    weight_sparse_mask = sparsity.create_mask(
        weight_nparray.T, func_name=func_name, n=n, m=m
    ).T
    weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
    assert sparsity.check_sparsity(
        weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name
    ), 'Pruning {} weight matrix failure!!!'.format(param_name)
    return weight_pruned_nparray, weight_sparse_mask
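# --- Illustrative sketch (not part of this commit) ----------------------------
# The comment above explains that pruning is row-major, so the weight is
# transposed before masking and the mask is transposed back. A plain-NumPy
# stand-in for that round trip (the real code uses sparsity.create_mask):
import numpy as np

w = np.random.rand(4, 8)
mask_t = (np.abs(w.T) > 0.5).astype(w.dtype)  # stand-in mask computed on w.T
mask = mask_t.T                               # back to the layout of w
assert mask.shape == w.shape
pruned = np.multiply(w, mask)
# -------------------------------------------------------------------------------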
@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {}
def add_supported_layer(layer, pruning_func=None):
    r"""

    Add supported layers and their corresponding pruning functions.

    Args:
@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None):
        pruning_func (function, optional): a function type which receives five arguments (weight_nparray,
            m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
            m, n, and func_name, please see `prune_model` for details.
    """
    name = None
    if isinstance(layer, str):
        name = layer
    elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
        name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
            type(layer).__name__
        )
    elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
        name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
            layer.__name__
        )
    else:
        assert (
            "The type of layer should be string of Layer, but got {}!".format(
                type(layer)
            )
        )
    if pruning_func is None:
        pruning_func = _default_pruning
    _supported_layers_and_prune_func_map_lock.acquire()
...
@@ -27,9 +27,16 @@ from itertools import permutations
import threading

__all__ = [
    'calculate_density',
    'check_mask_1d',
    'get_mask_1d',
    'check_mask_2d',
    'get_mask_2d_greedy',
    'get_mask_2d_best',
    'create_mask',
    'check_sparsity',
    'MaskAlgo',
    'CheckMethod',
]
@@ -76,8 +83,9 @@ class CheckMethod(Enum):
            CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
            # CheckMethod.CHECK_2D
        """
        assert isinstance(
            mask_algo, MaskAlgo
        ), "mask_algo should be MaskAlgo type"
        if mask_algo == MaskAlgo.MASK_1D:
            return CheckMethod.CHECK_1D
        else:
@@ -86,20 +94,25 @@ class CheckMethod(Enum):
def calculate_density(x):
    r"""

    Return the density of the input tensor.

    Args:
        x (nparray): The input tensor.

    Returns:
        float, The density of :attr:`x`.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            x = np.array([[0, 1, 3, 0],
                          [1, 1, 0, 1]])
            paddle.incubate.asp.calculate_density(x) # 0.625

    """
    x_flattened = x.flatten()
    return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
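# --- Illustrative check (not part of this commit) -----------------------------
# The docstring example above reports 0.625 for this input; the formula
# (non-zero count / total count) gives the same value:
import numpy as np

x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]])
assert float(np.count_nonzero(x)) / x.size == 0.625
# -------------------------------------------------------------------------------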
@@ -126,7 +139,7 @@ def _reshape_1d(mat, m):
    remainder = mat.shape[1] % m
    if mat.shape[1] % m > 0:
        mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
        mat_padded[:, : mat.shape[1]] = mat
        shape = mat_padded.shape
        return mat_padded.reshape(-1, m), shape
    else:
@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m):
        min_order_indices = np.argsort(np.absolute(sub_mat))
        mask_flattern[i, min_order_indices[:n].tolist()] = 0
    mask_flattern = mask_flattern.reshape(shape)
    mask[:, :] = mask_flattern[:, : mat.shape[1]]
    return mask
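# --- Illustrative sketch (not part of this commit) ----------------------------
# get_mask_1d zeroes the n smallest-magnitude entries in every group of m
# values. One group with the default-style n=2, m=4:
import numpy as np

group = np.array([0.1, -0.9, 0.05, 0.7])
mask = np.ones(4)
mask[np.argsort(np.abs(group))[:2]] = 0
print(mask)  # [0. 1. 0. 1.] -> two of four weights survive (2:4 sparsity)
# -------------------------------------------------------------------------------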
@@ -239,12 +252,12 @@ def _reshape_2d(mat, m):
    remainder_0 = mat.shape[0] % m
    remainder_1 = mat.shape[1] % m
    new_shape = (
        mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0),
        mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1),
    )
    mat_padded = np.zeros(new_shape)
    mat_padded[: mat.shape[0], : mat.shape[1]] = mat

    mat_flattern = np.empty(new_shape).reshape(-1, m * m)
    curr_idx = 0
@@ -252,9 +265,9 @@ def _reshape_2d(mat, m):
        row_end = row_start + m
        for col_start in range(0, mat_padded.shape[1], m):
            col_end = col_start + m
            sub_mat = np.squeeze(
                mat_padded[row_start:row_end, col_start:col_end].reshape(-1)
            )
            mat_flattern[curr_idx] = sub_mat
            curr_idx += 1
    return mat_flattern, mat_padded.shape
@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m):
    mat_padded, shape = _reshape_2d(mat, m)
    for sub_mat in mat_padded:
        sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
        if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and (
            np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0
        ):
            return False
    return True
@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m):
        sub_mask = np.squeeze(mask_padded[idx])

        min_order_1d_indices = np.argsort(sub_mat)
        min_order_2d_indices = [
            (int(x / m), x % m) for x in min_order_1d_indices
        ]
        row_counter = collections.Counter()
        col_counter = collections.Counter()

        for i in range(len(min_order_1d_indices) - 1, -1, -1):
            matrix_entry = min_order_2d_indices[i]
            if (row_counter[matrix_entry[0]] == n) or (
                col_counter[matrix_entry[1]] == n
            ):
                continue

            sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m):
            col_end = col_start + m
            mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
            curr_idx += 1
    return mask[: mat.shape[0], : mat.shape[1]]


_valid_2d_patterns_lock = threading.Lock()
@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m):
    patterns = patterns + patterns
    patterns = np.asarray(list(set(permutations(patterns, m))))

    valid = (
        ((patterns.sum(axis=1) <= n).sum(axis=1) == m)
        .nonzero()[0]
        .reshape(-1)
    )
    valid_patterns = np.empty((valid.shape[0], m, m))
    valid_patterns[:] = patterns[valid[:]]
@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m):
    mat_flattern, shape = _reshape_2d(mat, m)
    mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
    pmax = np.argmax(
        np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
        axis=1,
    )
    mask_flattern[:] = patterns[pmax[:]]
    mask = np.empty(shape)
@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m):
            col_end = col_start + m
            mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
            curr_idx += 1
    return mask[: mat.shape[0], : mat.shape[1]]
def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
    dtype = tensor.dtype
    t = tensor.astype(float)

    assert isinstance(func_name, MaskAlgo), (
        "func_name argumet of create_mask is only accepted as type MaskAlgo. "
        "But got {}".format(type(func_name))
    )
    func = getattr(sys.modules[__name__], func_name.value, None)
    if len(shape) == 1:
        t = t.reshape(1, shape[0])
@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
        t = t.reshape(shape[0] * shape[1], shape[2])
    # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
    elif len(shape) == 4:
        t = t.transpose([0, 1, 3, 2]).reshape(
            shape[0] * shape[1] * shape[3], shape[2]
        )
        mask = func(t, n=n, m=m)
        return (
            mask.reshape([shape[0], shape[1], shape[3], shape[2]])
            .transpose([0, 1, 3, 2])
            .astype(dtype)
        )
    else:
        raise ValueError(
            "The dimension of input tensor is not supported in create_mask, "
            "Only dimension < 4 is supported but got {}".format(len(shape))
        )

    mask = func(t, n=n, m=m)
    return mask.reshape(shape).astype(dtype)
@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
    shape = tensor.shape
    t = tensor.astype(float)

    assert type(func_name) == CheckMethod, (
        "func_name argumet of check_sparsity is only accepted as type CheckMethod. "
        "But got {}".format(type(func_name))
    )
    func = getattr(sys.modules[__name__], func_name.value, None)
    if len(shape) == 1:
        t = t.reshape(1, shape[0])
@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
        t = t.reshape(shape[0] * shape[1], shape[2])
    # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
    elif len(shape) == 4:
        t = t.transpose([0, 1, 3, 2]).reshape(
            [shape[0] * shape[1] * shape[3], shape[2]]
        )
    else:
        raise ValueError(
            "The dimension of input tensor is not supported in create_mask, "
            "Only dimension < 4 is supported but got {}".format(len(shape))
        )

    return func(t, n=n, m=m)
@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass):
@six.add_metaclass(VariableMetaClass)
class Variable(object):
    """

    Notes:
        The constructor of Variable should not be invoked directly.

        In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being fed.

        In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data.

    In Fluid, every input and output of an OP is a variable. In most
    cases, variables are used for holding different kinds of data or training
@@ -1514,12 +1515,13 @@ class Variable(object):
    def detach(self):
        """

        Returns a new Variable, detached from the current graph.
        It will share data with origin Variable and without tensor copy.
        In addition, the detached Variable doesn't provide gradient propagation.

        Returns:
            ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable.

        Examples:
            .. code-block:: python
@@ -1533,6 +1535,7 @@ class Variable(object):
                # create a detached Variable
                y = x.detach()

        """

        assert (
@@ -2085,6 +2088,7 @@ class Variable(object):
    @property
    def T(self):
        """

        Permute current Variable with its dimensions reversed.
        If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`.
@@ -2103,6 +2107,7 @@ class Variable(object):
                x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0]
                print(x_T_np.shape)
                # (5, 3, 2)

        """
        if len(self.shape) == 1:
            return self
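# --- Illustrative check (not part of this commit) -----------------------------
# The reversed-axes rule documented above, demonstrated with NumPy shapes:
# for a 3-D value, .T behaves like transpose([2, 1, 0]).
import numpy as np

x = np.zeros((2, 3, 5))
assert x.T.shape == x.transpose([2, 1, 0]).shape == (5, 3, 2)
# -------------------------------------------------------------------------------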
@@ -2141,7 +2146,7 @@ class Variable(object):
            as ``out = assign(tensor)`` .

        Returns:
            Variable, The cloned Variable.

        Examples:
            .. code-block:: python
@@ -2171,6 +2176,7 @@ class Variable(object):
    def _set_error_clip(self, error_clip):
        """

        Set the error_clip.

        Args:
@@ -2178,11 +2184,13 @@ class Variable(object):
        Returns:
            None

        """

        self.error_clip = error_clip

    def _set_info(self, key, value):
        """

        Set key-value information for this variable.

        Args:
@@ -2191,6 +2199,7 @@ class Variable(object):
        Returns:
            None

        """

        if not hasattr(self, "_info"):
            self._info = {}
@@ -2198,6 +2207,7 @@ class Variable(object):
    def _get_info(self, key):
        """

        Get the information of this variable corresponding to key.

        Args:
@@ -2205,6 +2215,7 @@ class Variable(object):
        Returns:
            object

        """
        if hasattr(self, "_info") and key in self._info:
            return self._info[key]
@@ -2212,7 +2223,9 @@ class Variable(object):
    def _slice_indices(self, slice, length):
        """

        Reference implementation for the slice.indices method.

        """

        # Compute step and length as integers.
        step = 1 if slice.step is None else slice.step
@@ -2383,7 +2396,7 @@ class Variable(object):
                Default: None

        Returns:
            Tensor, the value in given scope.

        Examples:
            .. code-block:: python
@@ -2438,6 +2451,7 @@ class Variable(object):
    def set_value(self, value, scope=None):
        '''

        Set the value to the tensor in given scope.

        Args:
@@ -2477,6 +2491,7 @@ class Variable(object):
                    if var.persistable:
                        t_load = paddle.load(path+var.name+'.pdtensor')
                        var.set_value(t_load)

        '''
        # The 'framework' is a low-level module, and 'executor'
@@ -2547,10 +2562,11 @@ class Variable(object):
    def size(self):
        """

        Returns the number of elements for current Variable, which is a int64 Variable with shape [1]

        Returns:
            Variable, the number of elements for current Variable

        Examples:
            .. code-block:: python
@@ -2564,6 +2580,7 @@ class Variable(object):
                # get the number of elements of the Variable
                y = x.size()

        """

        output = self.block.create_var(
@@ -2578,23 +2595,27 @@ class Variable(object):
    def _set_attr(self, name, val):
        """

        Set the value of attribute by attribute's name.

        Args:
            name(str): the attribute name.
            val(int|str|list): the value of the attribute.

        """
        self._update_desc_attr(name, val)

    def _has_attr(self, name):
        """

        Whether this Variable has the attribute with the name `name` or not.

        Args:
            name(str): the attribute name.

        Returns:
            bool, True if has this attribute.

        """
        return self.desc.has_attr(name)
@@ -2624,7 +2645,7 @@ class Variable(object):
            name(str): the attribute name.

        Returns:
            int|str|list, The attribute value. The return value
                can be any valid attribute type.
        """
        return self.desc.attr(name)
@@ -3196,14 +3217,16 @@ class Operator(object):
    def input(self, name):
        r"""

        Get the input arguments according to the input parameter name.

        Args:
            name(str): The input parameter name.

        Returns:
            list, return the list of argument names that are associated with \
                the specific parameter name.

        """
        return self.desc.input(name)
...
@@ -20,7 +20,13 @@ from __future__ import print_function
import warnings
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import (
    Variable,
    _non_static_mode,
    _varbase_creator,
    _in_legacy_dygraph,
    in_dygraph_mode,
)
from .. import core
from ..param_attr import ParamAttr
from . import nn
@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None):
    """
    accuracy layer.
    Refer to https://en.wikipedia.org/wiki/Precision_and_recall

    This function computes the accuracy using the input and label.
    If the correct label occurs in top k predictions, then correct will increment by one.

    Note:
        the dtype of accuracy is determined by input. the input and label dtype can be different.

    Args:
        input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64.
            The shape is ``[sample_number, class_dim]`` .
        label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
        k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1.
        correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None.
        total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None.

    Returns:
        Tensor, The correct rate. A Tensor with type float32.

    Examples:
        .. code-block:: python

            import numpy as np
            import paddle
            import paddle.static as static
@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
                          fetch_list=[result[0]])
            print(output)
            # [array([0.], dtype=float32)]

    """
    if _non_static_mode():
        if correct is None:
@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None):
            total = _varbase_creator(dtype="int32")

        _k = k.numpy().item(0) if isinstance(k, Variable) else k
        topk_out, topk_indices = _legacy_C_ops.top_k_v2(
            input, 'k', _k, 'sorted', False
        )
        _acc, _, _ = _legacy_C_ops.accuracy(
            topk_out, topk_indices, label, correct, total
        )
        return _acc

    helper = LayerHelper("accuracy", **locals())
    check_variable_and_dtype(
        input, 'input', ['float16', 'float32', 'float64'], 'accuracy'
    )
    topk_out = helper.create_variable_for_type_inference(dtype=input.dtype)
    topk_indices = helper.create_variable_for_type_inference(dtype="int64")
    inputs = {"X": [input]}
@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None):
    else:
        attrs = {'k': k}
    attrs['sorted'] = False
    helper.append_op(
        type="top_k_v2",
        inputs=inputs,
        attrs=attrs,
        outputs={"Out": [topk_out], "Indices": [topk_indices]},
    )
    acc_out = helper.create_variable_for_type_inference(dtype="float32")
    if correct is None:
        correct = helper.create_variable_for_type_inference(dtype="int32")
    if total is None:
        total = helper.create_variable_for_type_inference(dtype="int32")
    helper.append_op(
        type="accuracy",
        inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]},
        outputs={
            "Accuracy": [acc_out],
            "Correct": [correct],
            "Total": [total],
        },
    )
    return acc_out
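# --- Illustrative sketch (not part of this commit) ----------------------------
# The accuracy semantics described in the docstring above, reduced to plain
# NumPy for k=1: a sample counts as correct when its label is the top
# prediction.
import numpy as np

pred = np.array([[0.1, 0.7, 0.2], [0.6, 0.3, 0.1]])
label = np.array([1, 2])
print((pred.argmax(axis=1) == label).mean())  # 0.5
# -------------------------------------------------------------------------------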
def auc(
    input,
    label,
    curve='ROC',
    num_thresholds=2**12 - 1,
    topk=1,
    slide_steps=1,
    ins_tag_weight=None,
):
    """
    **Area Under the Curve (AUC) Layer**
@@ -216,13 +232,14 @@ def auc(input,
    helper = LayerHelper("auc", **locals())
    if ins_tag_weight is None:
        ins_tag_weight = tensor.fill_constant(
            shape=[1, 1], dtype="float32", value=1.0
        )
    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc')
    check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc')
    check_variable_and_dtype(
        ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc'
    )
    auc_out = helper.create_variable_for_type_inference(dtype="float64")
    batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
@@ -236,62 +253,71 @@ def auc(input,
    batch_stat_pos = helper.create_global_variable(
        persistable=True,
        dtype='int64',
        shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
    )
    batch_stat_neg = helper.create_global_variable(
        persistable=True,
        dtype='int64',
        shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
    )

    # for global auc
    # Needn't maintain the batch id
    stat_pos = helper.create_global_variable(
        persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
    )
    stat_neg = helper.create_global_variable(
        persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
    )

    for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
        helper.set_variable_initializer(
            var, Constant(value=0.0, force_cpu=False)
        )

    # "InsTagWeight": [ins_tag_weight]
    # Batch AUC
    helper.append_op(
        type="auc",
        inputs={
            "Predict": [input],
            "Label": [label],
            "StatPos": [batch_stat_pos],
            "StatNeg": [batch_stat_neg],
        },
        attrs={
            "curve": curve,
            "num_thresholds": num_thresholds,
            "slide_steps": slide_steps,
        },
        outputs={
            "AUC": [batch_auc_out],
            "StatPosOut": [batch_stat_pos],
            "StatNegOut": [batch_stat_neg],
        },
    )
    # Global AUC
    helper.append_op(
        type="auc",
        inputs={
            "Predict": [input],
            "Label": [label],
            "StatPos": [stat_pos],
            "StatNeg": [stat_neg],
        },
        attrs={
            "curve": curve,
            "num_thresholds": num_thresholds,
            "slide_steps": 0,
        },
        outputs={
            "AUC": [auc_out],
            "StatPosOut": [stat_pos],
            "StatNegOut": [stat_neg],
        },
    )
    return (
        auc_out,
        batch_auc_out,
        [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg],
    )
This file's diff is collapsed.
@@ -241,13 +241,13 @@ def send_ue_recv(
        src_index (Tensor): A 1-D tensor, and the available data type is int32, int64.
        dst_index (Tensor): A 1-D tensor, and should have the same shape as `src_index`.
            The available data type is int32, int64.
        message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
        reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
            Default value is `sum`.
        out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
            out_size is smaller than or equal to 0, then this input will not be used.
            Otherwise, `out_size` should be equal to or larger than
            max(dst_index) + 1. Default value is `None`.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.
...
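# --- Illustrative sketch (not part of this commit) ----------------------------
# The message_op/reduce_op behaviour documented above, modelled in NumPy for
# message_op="add" and reduce_op="sum": gather node features by src_index,
# combine them with edge features, then scatter-add onto dst_index.
import numpy as np

x = np.array([[1.0], [2.0], [3.0]])        # node features
e = np.array([[0.5], [0.5], [1.0]])        # edge features
src_index = np.array([0, 1, 2])
dst_index = np.array([1, 2, 2])
msg = x[src_index] + e                     # message_op="add"
out = np.zeros_like(x)
np.add.at(out, dst_index, msg)             # reduce_op="sum"
print(out.ravel())  # [0.  1.5 6.5]
# -------------------------------------------------------------------------------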
@@ -26,6 +26,7 @@ def reindex_graph(
    x, neighbors, count, value_buffer=None, index_buffer=None, name=None
):
    """

    Reindex Graph API.

    This API is mainly used in Graph Learning domain, which should be used
@@ -49,12 +50,12 @@ def reindex_graph(
            should be the same with `x`.
        count (Tensor): The neighbor count of the input nodes `x`. And the
            data type should be int32.
        value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
            and should be filled with -1. Only useful for gpu version. Default is None.
        index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
            and should be filled with -1. Only useful for gpu version.
            `value_buffer` and `index_buffer` should be both not None
            if you want to speed up by using hashtable buffer. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.
@@ -69,6 +70,7 @@ def reindex_graph(
        .. code-block:: python

            import paddle

            x = [0, 1, 2]
            neighbors = [8, 9, 0, 4, 7, 6, 7]
            count = [2, 3, 2]
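# --- Illustrative sketch (not part of this commit) ----------------------------
# What "reindexing" means for the example values above: input nodes keep ids
# 0..len(x)-1 and unseen neighbors get the next fresh ids, so the neighbor
# list can be rewritten with the new ids.
x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7]
order = []
for node in x + neighbors:
    if node not in order:
        order.append(node)
mapping = {node: i for i, node in enumerate(order)}
print([mapping[n] for n in neighbors])  # [3, 4, 0, 5, 6, 7, 6]
# -------------------------------------------------------------------------------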
@@ -138,6 +140,7 @@ def reindex_heter_graph(
    x, neighbors, count, value_buffer=None, index_buffer=None, name=None
):
    """

    Reindex HeterGraph API.

    This API is mainly used in Graph Learning domain, which should be used
@@ -161,12 +164,12 @@ def reindex_heter_graph(
            The data type should be the same with `x`.
        count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
            And the data type should be int32.
        value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
            and should be filled with -1. Only useful for gpu version. Default is None.
        index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
            and should be filled with -1. Only useful for gpu version.
            `value_buffer` and `index_buffer` should be both not None
            if you want to speed up by using hashtable buffer. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.
@@ -183,6 +186,7 @@ def reindex_heter_graph(
        .. code-block:: python

            import paddle

            x = [0, 1, 2]
            neighbors_a = [8, 9, 0, 4, 7, 6, 7]
            count_a = [2, 3, 2]
...
...@@ -32,6 +32,7 @@ def sample_neighbors( ...@@ -32,6 +32,7 @@ def sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -52,16 +53,16 @@ def sample_neighbors( ...@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1, sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes. which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True, eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the then `eids` should not be None. The data type should be the
same with `row`. Default is None. same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False. return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for Fisher-Yates sampling. If `use_perm_buffer` perm_buffer (Tensor, optional): Permutation buffer for Fisher-Yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use Fisher-Yates sampling be the same with `row`. If not None, we will use Fisher-Yates sampling
to speed up. Only useful for gpu version. to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -78,6 +79,7 @@ def sample_neighbors( ...@@ -78,6 +79,7 @@ def sample_neighbors(
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
......
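A short companion sketch for the argument list above, using the edge list from the docstring example; the `colptr` tensor is the CSC column pointer derived from those edges, and the import path `paddle.geometric.sample_neighbors` is assumed from the surrounding 2.4 module.

```python
import paddle

# CSC form of the edge list shown above (10 nodes, 13 edges).
row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")  # arbitrary input nodes

# Sample at most 2 neighbors for each input node.
out_neighbors, out_count = paddle.geometric.sample_neighbors(
    row, colptr, nodes, sample_size=2)
print(out_count)  # number of neighbors actually sampled per input node
```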
...@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
name=None, name=None,
): ):
r""" r"""
The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:
.. code-block:: python .. code-block:: python
y = layer_norm(residual + dropout(bias + x)) y = layer_norm(residual + dropout(bias + x))
Parameters: Parameters:
...@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The output Tensor, the data type and shape is same as `x`. Tensor, The output Tensor, the data type and shape is same as `x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
...@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
x, residual, bias) x, residual, bias)
# [2, 4, 128] # [2, 4, 128]
print(output.shape) print(output.shape)
""" """
seed = None seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'): if mode not in ('downscale_in_infer', 'upscale_in_train'):
......
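The fused operator itself needs a GPU build, but the pseudo code above can be reproduced with standard ops; here is a CPU-friendly sketch of the unfused computation, with shapes chosen to match the docstring example.

```python
import paddle
import paddle.nn.functional as F

x = paddle.rand([2, 4, 128])
residual = paddle.rand([2, 4, 128])
bias = paddle.rand([128])

# y = layer_norm(residual + dropout(bias + x)), spelled out with unfused ops.
y = F.layer_norm(residual + F.dropout(x + bias, p=0.5), normalized_shape=[128])
print(y.shape)  # [2, 4, 128]
```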
...@@ -20,14 +20,17 @@ from paddle.fluid import core ...@@ -20,14 +20,17 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row, def graph_khop_sampler(
row,
colptr, colptr,
input_nodes, input_nodes,
sample_sizes, sample_sizes,
sorted_eids=None, sorted_eids=None,
return_eids=False, return_eids=False,
name=None): name=None,
):
""" """
Graph Khop Sampler API. Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -50,24 +53,23 @@ def graph_khop_sampler(row, ...@@ -50,24 +53,23 @@ def graph_khop_sampler(row,
sample_sizes (list|tuple): The number of neighbors and number of layers we want sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape to sample. The data type should be int, and the shape
should only have one dimension. should only have one dimension.
sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids` sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
is True. The shape should be [num_edges, 1], and the data is True. The shape should be [num_edges, 1], and the data
type should be the same with `row`. type should be the same with `row`. Default is None.
return_eids (bool): Whether to return the id of the sample edges. Default is False. return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
edge_src (Tensor): The src index of the output edges, also means the first column of - edge_src (Tensor), The src index of the output edges, also means the first column of
the edges. The shape is [num_sample_edges, 1] currently. the edges. The shape is [num_sample_edges, 1] currently.
edge_dst (Tensor): The dst index of the output edges, also means the second column - edge_dst (Tensor), The dst index of the output edges, also means the second column
of the edges. The shape is [num_sample_edges, 1] currently. of the edges. The shape is [num_sample_edges, 1] currently.
sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes. - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
reindex_nodes (Tensor): The reindex id of the input nodes. - reindex_nodes (Tensor), The reindex id of the input nodes.
edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. - edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -80,44 +82,72 @@ def graph_khop_sampler(row, ...@@ -80,44 +82,72 @@ def graph_khop_sampler(row,
colptr = paddle.to_tensor(colptr, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64") nodes = paddle.to_tensor(nodes, dtype="int64")
edge_src, edge_dst, sample_index, reindex_nodes = \ edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
""" """
if _non_static_mode(): if _non_static_mode():
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None "
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ f"if return_eids is True."
_legacy_C_ops.graph_khop_sampler(row, sorted_eids, )
colptr, input_nodes, (
"sample_sizes", sample_sizes, edge_src,
"return_eids", True) edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
edge_src, edge_dst, sample_index, reindex_nodes, _ = \ (
_legacy_C_ops.graph_khop_sampler(row, None, edge_src,
colptr, input_nodes, edge_dst,
"sample_sizes", sample_sizes, sample_index,
"return_eids", False) reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None " f"if return_eids is True."
check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_khop_sampler")
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals()) helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
...@@ -125,24 +155,23 @@ def graph_khop_sampler(row, ...@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_khop_sampler", helper.append_op(
type="graph_khop_sampler",
inputs={ inputs={
"Row": row, "Row": row,
"Eids": sorted_eids, "Eids": sorted_eids,
"Col_Ptr": colptr, "Col_Ptr": colptr,
"X": input_nodes "X": input_nodes,
}, },
outputs={ outputs={
"Out_Src": edge_src, "Out_Src": edge_src,
"Out_Dst": edge_dst, "Out_Dst": edge_dst,
"Sample_Index": sample_index, "Sample_Index": sample_index,
"Reindex_X": reindex_nodes, "Reindex_X": reindex_nodes,
"Out_Eids": edge_eids "Out_Eids": edge_eids,
}, },
attrs={ attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
"sample_sizes": sample_sizes, )
"return_eids": return_eids
})
if return_eids: if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
......
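A usage sketch for the dynamic-graph path shown above; the CSC tensors reuse the toy graph from earlier in this diff and are illustrative only, and keyword arguments are used so the positional order of `sorted_eids` does not matter.

```python
import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
sample_sizes = [2, 2]  # sample 2 neighbors per node, for 2 hops

# return_eids=False, so four tensors come back (no edge_eids).
edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(
    row, colptr, nodes, sample_sizes, return_eids=False)
print(sample_index)  # original ids of the input nodes plus their sampled neighbors
```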
...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated import paddle.utils.deprecated as deprecated
@deprecated(since="2.4.0", @deprecated(
since="2.4.0",
update_to="paddle.geometric.reindex_graph", update_to="paddle.geometric.reindex_graph",
level=1, level=1,
reason="paddle.incubate.graph_reindex will be removed in future") reason="paddle.incubate.graph_reindex will be removed in future",
def graph_reindex(x, )
def graph_reindex(
x,
neighbors, neighbors,
count, count,
value_buffer=None, value_buffer=None,
index_buffer=None, index_buffer=None,
flag_buffer_hashtable=False, flag_buffer_hashtable=False,
name=None): name=None,
):
""" """
Graph Reindex API. Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -40,7 +45,7 @@ def graph_reindex(x, ...@@ -40,7 +45,7 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex. corresponding graph edges after reindex.
**Notes**: Notes:
The number in x should be unique, otherwise it would cause potential errors. The number in x should be unique, otherwise it would cause potential errors.
Besides, we also support multi-edge-types neighbors reindexing. If we have different Besides, we also support multi-edge-types neighbors reindexing. If we have different
edge_type neighbors for x, we should concatenate all the neighbors and count of x. edge_type neighbors for x, we should concatenate all the neighbors and count of x.
...@@ -58,24 +63,23 @@ def graph_reindex(x, ...@@ -58,24 +63,23 @@ def graph_reindex(x,
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently. Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_src (Tensor), The source node index of graph edges after reindex.
reindex_dst (Tensor): The destination node index of graph edges after reindex. - reindex_dst (Tensor), The destination node index of graph edges after reindex.
out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor where we put the input nodes `x` in the front, and put neighbor
nodes in the back. nodes in the back.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -109,47 +113,55 @@ def graph_reindex(x, ...@@ -109,47 +113,55 @@ def graph_reindex(x,
""" """
if flag_buffer_hashtable: if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None: if value_buffer is None or index_buffer is None:
raise ValueError(f"`value_buffer` and `index_buffer` should not" raise ValueError(
"be None if `flag_buffer_hashtable` is True.") f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode(): if _non_static_mode():
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
_legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, x,
"flag_buffer_hashtable", flag_buffer_hashtable) neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), check_variable_and_dtype(
"graph_reindex") neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable: if flag_buffer_hashtable:
check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), check_variable_and_dtype(
"graph_reindex") value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), )
"graph_reindex") check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals()) helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="graph_reindex", helper.append_op(
type="graph_reindex",
inputs={ inputs={
"X": "X": x,
x, "Neighbors": neighbors,
"Neighbors": "Count": count,
neighbors, "HashTable_Value": value_buffer if flag_buffer_hashtable else None,
"Count": "HashTable_Index": index_buffer if flag_buffer_hashtable else None,
count,
"HashTable_Value":
value_buffer if flag_buffer_hashtable else None,
"HashTable_Index":
index_buffer if flag_buffer_hashtable else None,
}, },
outputs={ outputs={
"Reindex_Src": reindex_src, "Reindex_Src": reindex_src,
"Reindex_Dst": reindex_dst, "Reindex_Dst": reindex_dst,
"Out_Nodes": out_nodes "Out_Nodes": out_nodes,
}, },
attrs={"flag_buffer_hashtable": flag_buffer_hashtable}) attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
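A brief usage sketch of the deprecated incubate API whose implementation appears above, without hashtable buffers; the tensors mirror the docstring's single-edge-type case.

```python
import paddle

x = paddle.to_tensor([0, 1, 2], dtype="int64")
neighbors = paddle.to_tensor([8, 9, 0, 4, 7, 6, 7], dtype="int64")
count = paddle.to_tensor([2, 3, 2], dtype="int32")

reindex_src, reindex_dst, out_nodes = paddle.incubate.graph_reindex(
    x, neighbors, count)
# out_nodes keeps the input nodes in front, then newly seen neighbors.
print(out_nodes)  # [0, 1, 2, 8, 9, 4, 7, 6]
```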
...@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated ...@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated
since="2.4.0", since="2.4.0",
update_to="paddle.geometric.sample_neighbors", update_to="paddle.geometric.sample_neighbors",
level=1, level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future") reason="paddle.incubate.graph_sample_neighbors will be removed in future",
def graph_sample_neighbors(row, )
def graph_sample_neighbors(
row,
colptr, colptr,
input_nodes, input_nodes,
eids=None, eids=None,
...@@ -34,8 +36,10 @@ def graph_sample_neighbors(row, ...@@ -34,8 +36,10 @@ def graph_sample_neighbors(row,
sample_size=-1, sample_size=-1,
return_eids=False, return_eids=False,
flag_perm_buffer=False, flag_perm_buffer=False,
name=None): name=None,
):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -71,14 +75,13 @@ def graph_sample_neighbors(row, ...@@ -71,14 +75,13 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
out_neighbors (Tensor): The sample neighbors of the input nodes. - out_neighbors (Tensor), The sample neighbors of the input nodes.
out_count (Tensor): The number of sampling neighbors of each input node, and the shape - out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
should be the same with `input_nodes`. - out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges.
out_eids (Tensor): If `return_eids` is True, we will return the eid information of the
sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
...@@ -98,59 +101,83 @@ def graph_sample_neighbors(row, ...@@ -98,59 +101,83 @@ def graph_sample_neighbors(row,
if return_eids: if return_eids:
if eids is None: if eids is None:
raise ValueError( raise ValueError(
f"`eids` should not be None if `return_eids` is True.") f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer: if flag_perm_buffer:
if perm_buffer is None: if perm_buffer is None:
raise ValueError( raise ValueError(
f"`perm_buffer` should not be None if `flag_perm_buffer`" f"`perm_buffer` should not be None if `flag_perm_buffer`"
"is True.") "is True."
)
if _non_static_mode(): if _non_static_mode():
out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( (
row, colptr, input_nodes, eids, perm_buffer, "sample_size", out_neighbors,
sample_size, "return_eids", return_eids, "flag_perm_buffer", out_count,
flag_perm_buffer) out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") row, "Row", ("int32", "int64"), "graph_sample_neighbors"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_sample_neighbors") check_variable_and_dtype(
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
"graph_sample_neighbors") )
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids: if return_eids:
check_variable_and_dtype(eids, "Eids", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer: if flag_perm_buffer:
check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals()) helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_sample_neighbors", helper.append_op(
type="graph_sample_neighbors",
inputs={ inputs={
"Row": row, "Row": row,
"Col_Ptr": colptr, "Col_Ptr": colptr,
"X": input_nodes, "X": input_nodes,
"Eids": eids if return_eids else None, "Eids": eids if return_eids else None,
"Perm_Buffer": "Perm_Buffer": perm_buffer if flag_perm_buffer else None,
perm_buffer if flag_perm_buffer else None
}, },
outputs={ outputs={
"Out": out_neighbors, "Out": out_neighbors,
"Out_Count": out_count, "Out_Count": out_count,
"Out_Eids": out_eids "Out_Eids": out_eids,
}, },
attrs={ attrs={
"sample_size": sample_size, "sample_size": sample_size,
"return_eids": return_eids, "return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer "flag_perm_buffer": flag_perm_buffer,
}) },
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
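To exercise the `return_eids=True` branch that the validation above guards, a small sketch with the same toy graph; the edge ids are just 0..12 for illustration.

```python
import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
eids = paddle.arange(13, dtype="int64")  # one id per edge, same dtype as row

out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors(
    row, colptr, nodes, eids=eids, sample_size=2, return_eids=True)
print(out_eids)  # ids of the sampled edges
```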
...@@ -715,6 +715,7 @@ def upsample( ...@@ -715,6 +715,7 @@ def upsample(
name=None, name=None,
): ):
""" """
This API resizes a batch of images. This API resizes a batch of images.
The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
...@@ -725,11 +726,12 @@ def upsample( ...@@ -725,11 +726,12 @@ def upsample(
and the resizing only applies on the three dimensions(depth, height and width). and the resizing only applies on the three dimensions(depth, height and width).
Supporting resample methods: Supporting resample methods:
'linear' : Linear interpolation - 'linear' : Linear interpolation
'bilinear' : Bilinear interpolation - 'bilinear' : Bilinear interpolation
'trilinear' : Trilinear interpolation - 'trilinear' : Trilinear interpolation
'nearest' : Nearest neighbor interpolation - 'nearest' : Nearest neighbor interpolation
'bicubic' : Bicubic interpolation - 'bicubic' : Bicubic interpolation
Linear interpolation is the method of using a line connecting two known quantities Linear interpolation is the method of using a line connecting two known quantities
to determine the value of an unknown quantity between the two known quantities. to determine the value of an unknown quantity between the two known quantities.
...@@ -831,8 +833,9 @@ def upsample( ...@@ -831,8 +833,9 @@ def upsample(
D_out = D_{in} * scale_{factor} D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor} H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor} W_out = W_{in} * scale_{factor}
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of linear interpolation, please refer to Wikipedia: For details of linear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of nearest neighbor interpolation, please refer to Wikipedia: For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
...@@ -876,6 +879,7 @@ def upsample( ...@@ -876,6 +879,7 @@ def upsample(
name(str, optional): The default value is None. name(str, optional): The default value is None.
Normally there is no need for user to set this property. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` For more information, please refer to :ref:`api_guide_Name`
Returns: Returns:
A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
......
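A shape-level sketch of the 4-D (NCHW) case with bilinear resampling, to make the size bookkeeping above concrete.

```python
import paddle
import paddle.nn.functional as F

x = paddle.rand([2, 3, 6, 10])                # (num_batches, channels, in_h, in_w)
out = F.upsample(x, size=[12, 20], mode='bilinear')
print(out.shape)                              # [2, 3, 12, 20]
```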
...@@ -23,6 +23,7 @@ __all__ = [] ...@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-order norm: distance is calculated by p-order norm:
...@@ -48,6 +49,7 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): ...@@ -48,6 +49,7 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
Returns: Returns:
Tensor, the dtype is same as input tensor. Tensor, the dtype is same as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
......
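The p-order norm the docstring describes can be written out directly with basic ops; a small sketch, with epsilon added to the difference as in the definition.

```python
import paddle

x = paddle.to_tensor([[1., 3.], [3., 5.]])    # [N, D]
y = paddle.to_tensor([[5., 6.], [7., 8.]])
p, epsilon = 2.0, 1e-6

dist = paddle.linalg.norm(x - y + epsilon, p=p, axis=-1, keepdim=False)
print(dist)  # shape [N]; with keepdim=True it would be [N, 1]
```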
...@@ -1450,15 +1450,16 @@ class Maxout(Layer): ...@@ -1450,15 +1450,16 @@ class Maxout(Layer):
class Softmax2D(Layer): class Softmax2D(Layer):
r""" r"""
Softmax2D Activation. Softmax2D Activation.
Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j). Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j).
The sum of result in each location (C, H_i, W_j) will be one. The sum of result in each location (C, H_i, W_j) will be one.
Shape: Shape:
- Input: :math:`(B, C, H, W)` or :math:`(C, H, W)` - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)`
- Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input) - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input)
Return: Returns:
A Tensor of the same shape and dtype as input with value in range [0, 1]. A Tensor of the same shape and dtype as input with value in range [0, 1].
Examples: Examples:
...@@ -1483,6 +1484,7 @@ class Softmax2D(Layer): ...@@ -1483,6 +1484,7 @@ class Softmax2D(Layer):
# [[0.42368975 0.51082766 0.47752273 0.5258871 ] # [[0.42368975 0.51082766 0.47752273 0.5258871 ]
# [0.66754097 0.47182566 0.5187628 0.5402329 ] # [0.66754097 0.47182566 0.5187628 0.5402329 ]
# [0.49014282 0.46369177 0.50340754 0.5289428 ]]]] # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]]
""" """
def __init__(self, name=None): def __init__(self, name=None):
......
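A quick check of the property stated above: after Softmax2D, the values at each spatial location sum to one across the channel axis.

```python
import paddle

x = paddle.rand([1, 2, 3, 4])        # (B, C, H, W)
m = paddle.nn.Softmax2D()
out = m(x)
print(paddle.sum(out, axis=1))       # all ones, shape [1, 3, 4]
```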
...@@ -20,6 +20,7 @@ __all__ = [] ...@@ -20,6 +20,7 @@ __all__ = []
class PairwiseDistance(Layer): class PairwiseDistance(Layer):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-order norm: distance is calculated by p-order norm:
...@@ -38,10 +39,10 @@ class PairwiseDistance(Layer): ...@@ -38,10 +39,10 @@ class PairwiseDistance(Layer):
Generally, no setting is required. Default: None. Generally, no setting is required. Default: None.
Shape: Shape:
x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
is the dimension of the data. Available data type is float32, float64. is the dimension of the data. Available data type is float32, float64.
y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
output: The same dtype as input tensor. - output: The same dtype as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
......
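Layer-level usage matching the shapes listed above; with the default `keepdim=False` the output collapses to `[N]`.

```python
import paddle

x = paddle.to_tensor([[1., 3.], [3., 5.]])    # [N, D]
y = paddle.to_tensor([[5., 6.], [7., 8.]])
dist = paddle.nn.PairwiseDistance(p=2)
print(dist(x, y))                             # shape [2], values close to [5., 5.]
```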
...@@ -224,6 +224,7 @@ class AvgPool2D(Layer): ...@@ -224,6 +224,7 @@ class AvgPool2D(Layer):
class AvgPool3D(Layer): class AvgPool3D(Layer):
""" """
This operation applies 3D average pooling over input features based on the input, This operation applies 3D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels, in NCDHW format, where N is batch size, C is the number of channels,
...@@ -264,6 +265,7 @@ class AvgPool3D(Layer): ...@@ -264,6 +265,7 @@ class AvgPool3D(Layer):
The data type can be float32, float64. The data type can be float32, float64.
- output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor.
The data type is same as input x. The data type is same as input x.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
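A shape sketch for the layer documented in this hunk: 2x2x2 average pooling with stride 2 halves each spatial dimension.

```python
import paddle

x = paddle.rand([1, 3, 8, 8, 8])              # NCDHW
pool = paddle.nn.AvgPool3D(kernel_size=2, stride=2)
out = pool(x)
print(out.shape)                              # [1, 3, 4, 4, 4]
```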
...@@ -514,14 +514,17 @@ class QuantizedConv2D(Layer): ...@@ -514,14 +514,17 @@ class QuantizedConv2D(Layer):
class QuantizedConv2DTranspose(Layer): class QuantizedConv2DTranspose(Layer):
""" """
The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose.
The only difference is that its inputs are all fake quantized. The only difference is that its inputs are all fake quantized.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
conv = nn.Conv2DTranspose(4, 6, (3, 3)) conv = nn.Conv2DTranspose(4, 6, (3, 3))
conv_quantized = QuantizedConv2DTranspose(conv) conv_quantized = QuantizedConv2DTranspose(conv)
...@@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer): ...@@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer):
y_np = y_var.numpy() y_np = y_var.numpy()
print(y_np.shape, y_quantized_np.shape) print(y_np.shape, y_quantized_np.shape)
# (2, 6, 10, 10), (2, 6, 10, 10) # (2, 6, 10, 10), (2, 6, 10, 10)
""" """
def __init__(self, def __init__(self,
......