Unverified commit 1490aaa9, authored by ustiniankw, committed by GitHub

[cherry-pick2.4]en-docs warning&error fix (#48332)

* fixdocs, test=document_fix

* fixdocs, test=document_fix
Parent 3fa7a736
@@ -26,7 +26,6 @@ non_auto_func_called = True
def __non_auto_func_called__(func):
    def __impl__(*args, **kwargs):
        global non_auto_func_called
        non_auto_func_called = False
@@ -112,6 +111,7 @@ class DistributedStrategy(object):
    def __init__(self):
        """
        DistributedStrategy is the main configuration entry for distributed training of Paddle.
        All of the distributed training configurations can be configured in DistributedStrategy,
        such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
@@ -129,7 +129,8 @@ class DistributedStrategy(object):
        key = 'FLAGS_cudnn_batchnorm_spatial_persistent'
        if _global_flags().is_public(key):
            self.strategy.cudnn_batchnorm_spatial_persistent = bool(
                _global_flags()[key]
            )
        key = 'FLAGS_conv_workspace_size_limit'
        if _global_flags().is_public(key):
            self.strategy.conv_workspace_size_limit = int(_global_flags()[key])
@@ -144,16 +145,17 @@ class DistributedStrategy(object):
    def __setattr__(self, key, value):
        if self.__lock_attr and not hasattr(self, key):
            raise TypeError(
                "%s is not a attribute of %s" % (key, self.__class__.__name__)
            )
        object.__setattr__(self, key, value)

    def save_to_prototxt(self, output):
        """
        Serialize current DistributedStrategy to string and save to output file

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -162,25 +164,28 @@ class DistributedStrategy(object):
                strategy.recompute = True
                strategy.recompute_configs = {"checkpoints": ["x"]}
                strategy.save_to_prototxt("dist_strategy.prototxt")

        """
        with open(output, "w") as fout:
            fout.write(str(self.strategy))
    def load_from_prototxt(self, pb_file):
        """
        Load from prototxt file for DistributedStrategy initialization

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.load_from_prototxt("dist_strategy.prototxt")

        """
        with open(pb_file, 'r') as f:
            self.strategy = google.protobuf.text_format.Merge(
                str(f.read()), self.strategy
            )
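    # Illustrative usage sketch (not part of the original diff): the two methods
    # above round-trip a strategy through a prototxt file; the file name is
    # arbitrary.
    #
    #     import paddle.distributed.fleet as fleet
    #
    #     strategy = fleet.DistributedStrategy()
    #     strategy.recompute = True
    #     strategy.save_to_prototxt("dist_strategy.prototxt")
    #
    #     restored = fleet.DistributedStrategy()
    #     restored.load_from_prototxt("dist_strategy.prototxt")
    #     assert restored.recompute is True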
    @property
    def execution_strategy(self):
@@ -188,7 +193,6 @@ class DistributedStrategy(object):
        Configure ExecutionStrategy for DistributedStrategy

        Examples:
            .. code-block:: python

                import paddle
@@ -199,12 +203,16 @@ class DistributedStrategy(object):
                strategy = paddle.distributed.fleet.DistributedStrategy()
                strategy.execution_strategy = exe_strategy

        """
        execution_strategy = paddle.fluid.ExecutionStrategy()
        fields = self.strategy.execution_strategy.DESCRIPTOR.fields
        for f in fields:
            setattr(
                execution_strategy,
                f.name,
                getattr(self.strategy.execution_strategy, f.name),
            )
        return execution_strategy

    @execution_strategy.setter
@@ -212,18 +220,21 @@ class DistributedStrategy(object):
    def execution_strategy(self, strategy):
        fields = self.strategy.execution_strategy.DESCRIPTOR.fields
        for f in fields:
            setattr(
                self.strategy.execution_strategy,
                f.name,
                getattr(strategy, f.name),
            )
    @property
    def build_strategy(self):
        """
        Configure BuildStrategy for DistributedStrategy
        Note that the properties of BuildStrategy are valid in DistributedStrategy
        only if the property is non-distributed strategy.

        Examples:
            .. code-block:: python

                import paddle
@@ -239,6 +250,7 @@ class DistributedStrategy(object):
                strategy = paddle.distributed.fleet.DistributedStrategy()
                strategy.build_strategy = build_strategy

        """
        build_strategy = paddle.fluid.BuildStrategy()
@@ -261,41 +273,48 @@ class DistributedStrategy(object):
                    value = ReduceStrategyFleet(value)
                setattr(self.strategy.build_strategy, f.name, value)
            elif f.label == 3:  # repeated field
                getattr(self.strategy.build_strategy, f.name).extend(
                    getattr(strategy, f.name)
                )
    @property
    def gradient_scale_configs(self):
        """
        Set the strategy of gradient scale

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.gradient_scale_configs = {'scale_strategy': 'avg'}

        Note that, strategy must be in 'avg', 'sum' or 'customized'

        """
        return get_msg_dict(self.strategy.gradient_scale_configs)

    @gradient_scale_configs.setter
    @is_strict_auto
    def gradient_scale_configs(self, config):
        check_configs_key(
            self.strategy.gradient_scale_configs,
            config,
            'gradient_scale_configs',
        )
        assign_configs_value(self.strategy.gradient_scale_configs, config)
    @property
    def a_sync(self):
        """
        Indicating whether we are using asynchronous stochastic gradient descent updates
        for training. This property is valid when we are using parameter server training,
        which is implied by setting an appropriate RoleMaker
        Default value: True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -307,6 +326,7 @@ class DistributedStrategy(object):
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.a_sync
@@ -318,12 +338,15 @@ class DistributedStrategy(object):
            self.a_sync_configs = {"k_steps": 0}
        else:
            raise ValueError(
                "The type of `flag` is invalid, expected type is bool, but received {}".format(
                    type(flag)
                )
            )
    @property
    def a_sync_configs(self):
        """
        Set a_sync update configurations. In general, asynchronous parameter server
        training has several configurable settings that can be configured through
        a dict.
@@ -344,7 +367,6 @@ class DistributedStrategy(object):
            runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -365,13 +387,15 @@ class DistributedStrategy(object):
    @a_sync_configs.setter
    @is_strict_auto
    def a_sync_configs(self, configs):
        check_configs_key(
            self.strategy.a_sync_configs, configs, "a_sync_configs"
        )
        assign_configs_value(self.strategy.a_sync_configs, configs)
    @property
    def trainer_desc_configs(self):
        """
        Set trainer desc configurations.

        **Notes**:
@@ -384,7 +408,6 @@ class DistributedStrategy(object):
            stat_var_names(list(str)):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -404,11 +427,11 @@ class DistributedStrategy(object):
    @property
    def adam_d2sum(self):
        """
        set adam_d2sum
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -420,6 +443,7 @@ class DistributedStrategy(object):
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.adam_d2sum
@@ -430,27 +454,37 @@ class DistributedStrategy(object):
            self.strategy.adam_d2sum = flag
        else:
            raise ValueError(
                "The type of `flag` is invalid, expected type is bool, but received {}".format(
                    type(flag)
                )
            )

    @trainer_desc_configs.setter
    @is_strict_auto
    def trainer_desc_configs(self, configs):
        check_configs_key(
            self.strategy.trainer_desc_configs, configs, "trainer_desc_configs"
        )
        assign_configs_value(self.strategy.trainer_desc_configs, configs)
    @property
    def fs_client_param(self):
        """
        Set fs client configurations.

        Note:
            uri(str): the uri of fs client
            user(str): the user_name of fs client
            passwd(str): the passwd of fs client
            hadoop_bin(str):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)
@@ -459,14 +493,16 @@ class DistributedStrategy(object):
                strategy.fs_client_param = configs
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.fs_client_param

    @fs_client_param.setter
    @is_strict_auto
    def fs_client_param(self, configs):
        check_configs_key(
            self.strategy.fs_client_param, configs, "fs_client_param"
        )
        assign_configs_value(self.strategy.fs_client_param, configs)
    @property
@@ -477,6 +513,7 @@ class DistributedStrategy(object):
    @is_strict_auto
    def sparse_table_configs(self, configs):
        from google.protobuf.descriptor import FieldDescriptor

        table_param = self.strategy.downpour_table_param

        def set_table_config(msg, config_name, configs, index=0):
@@ -493,8 +530,9 @@ class DistributedStrategy(object):
                            data = getattr(msg, field.name).add()
                            set_table_config(data, name, configs, i)
                    else:
                        set_table_config(
                            getattr(msg, field.name), name, configs
                        )
                else:
                    # print("not message:", name)
                    if name not in configs:
@@ -513,133 +551,206 @@ class DistributedStrategy(object):
            for table_name in configs:
                table_data = table_param.add()
                table_data.table_name = table_name
                set_table_config(
                    table_data,
                    "table_parameters." + table_name,
                    configs[table_name],
                )
    @sparse_table_configs.setter
    def fleet_desc_configs(self, configs):
        support_sparse_key_list = [
            'sparse_table_class',
            'sparse_compress_in_save',
            'sparse_shard_num',
            'sparse_accessor_class',
            'sparse_learning_rate',
            'sparse_initial_g2sum',
            'sparse_initial_range',
            'sparse_weight_bounds',
            'sparse_fea_dim',
            'sparse_embedx_dim',
            'sparse_embedx_threshold',
            'sparse_nonclk_coeff',
            'sparse_click_coeff',
            'sparse_base_threshold',
            'sparse_delta_threshold',
            'sparse_delta_keep_days',
            'sparse_delete_after_unseen_days',
            'sparse_show_click_decay_rate',
            'sparse_delete_threshold',
            'sparse_converter',
            'sparse_deconverter',
            'sparse_enable_cache',
            'sparse_cache_rate',
            'sparse_cache_file_num',
            'sparse_beta1_decay_rate',
            'sparse_beta2_decay_rate',
            'sparse_ada_epsilon',
            'sparse_optimizer',
            'sparse_ssd_unseenday_threshold',
            'embed_sparse_optimizer',
            'embed_sparse_learning_rate',
            'embed_sparse_weight_bounds',
            'embed_sparse_initial_range',
            'embed_sparse_initial_g2sum',
            'embed_sparse_beta1_decay_rate',
            'embed_sparse_beta2_decay_rate',
            'embedx_sparse_optimizer',
            'embedx_sparse_learning_rate',
            'embedx_sparse_weight_bounds',
            'embedx_sparse_initial_range',
            'embedx_sparse_initial_g2sum',
            'embedx_sparse_beta1_decay_rate',
            'embedx_sparse_beta2_decay_rate',
            'feature_learning_rate',
            'nodeid_slot',
        ]
        support_sparse_table_class = ['DownpourSparseTable']
        support_sparse_accessor_class = [
            'DownpourSparseValueAccessor',
            'DownpourCtrAccessor',
            'DownpourCtrDoubleAccessor',
            'DownpourUnitAccessor',
            'DownpourDoubleUnitAccessor',
            'DownpourCtrDymfAccessor',
        ]
        from google.protobuf.descriptor import FieldDescriptor

        table_param = self.strategy.downpour_table_param
        def add_graph_config(graph, strategy):
            graph.feature_learning_rate = strategy.get(
                'feature_learning_rate', 0.05
            )
            graph.nodeid_slot = strategy.get('nodeid_slot', 9008)
        def sparse_optimizer_config(sgd, strategy, prefix):
            optimizer_name = strategy.get(
                prefix + "sparse_optimizer", "adagrad"
            )
            sgd.name = optimizer_name
            if optimizer_name == "naive":
                sgd.name = "SparseNaiveSGDRule"
                sgd.naive.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.05
                )
                sgd.naive.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.naive.weight_bounds.extend(bounds)
            elif optimizer_name == "adagrad":
                sgd.name = 'SparseAdaGradSGDRule'
                sgd.adagrad.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.05
                )
                sgd.adagrad.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                if prefix == "embed_":
                    sgd.adagrad.initial_range = 0
                sgd.adagrad.initial_g2sum = strategy.get(
                    prefix + 'sparse_initial_g2sum', 3
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adagrad.weight_bounds.extend(bounds)
            elif optimizer_name == "std_adagrad":
                sgd.name = 'StdAdaGradSGDRule'
                sgd.adagrad.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.05
                )
                sgd.adagrad.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                if prefix == "embed_":
                    sgd.adagrad.initial_range = 0
                sgd.adagrad.initial_g2sum = strategy.get(
                    prefix + 'sparse_initial_g2sum', 3
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adagrad.weight_bounds.extend(bounds)
            elif optimizer_name == "adam":
                sgd.name = 'SparseAdamSGDRule'
                sgd.adam.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.001
                )
                sgd.adam.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                sgd.adam.beta1_decay_rate = strategy.get(
                    prefix + 'sparse_beta1_decay_rate', 0.9
                )
                sgd.adam.beta2_decay_rate = strategy.get(
                    prefix + 'sparse_beta2_decay_rate', 0.999
                )
                sgd.adam.ada_epsilon = strategy.get(
                    prefix + 'sparse_ada_epsilon', 1e-8
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adam.weight_bounds.extend(bounds)
            elif optimizer_name == "shared_adam":
                sgd.name = 'SparseSharedAdamSGDRule'
                sgd.adam.learning_rate = strategy.get(
                    prefix + 'sparse_learning_rate', 0.001
                )
                sgd.adam.initial_range = strategy.get(
                    prefix + 'sparse_initial_range', 1e-4
                )
                sgd.adam.beta1_decay_rate = strategy.get(
                    prefix + 'sparse_beta1_decay_rate', 0.9
                )
                sgd.adam.beta2_decay_rate = strategy.get(
                    prefix + 'sparse_beta2_decay_rate', 0.999
                )
                sgd.adam.ada_epsilon = strategy.get(
                    prefix + 'sparse_ada_epsilon', 1e-8
                )
                bounds = strategy.get(
                    prefix + 'sparse_weight_bounds', [-10, 10]
                )
                sgd.adam.weight_bounds.extend(bounds)
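        # Illustrative note (not part of the original diff): the helper above
        # maps a strategy dict onto one SGD rule; keys are prefixed for the
        # embed/embedx sub-tables. For example, assuming a user-supplied dict
        #
        #     {'embed_sparse_optimizer': 'adam',
        #      'embed_sparse_learning_rate': 0.001,
        #      'embed_sparse_beta1_decay_rate': 0.9}
        #
        # calling sparse_optimizer_config(sgd, config, 'embed_') would select
        # 'SparseAdamSGDRule' and fill learning_rate and beta1_decay_rate from
        # the dict, falling back to the defaults shown above for missing keys.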
        def set_sparse_table_config(table_data, config):
            for key in config:
                if key not in support_sparse_key_list:
                    raise ValueError("strategy key '%s' not support" % (key))
            table_class = config.get(
                "sparse_table_class", "DownpourSparseTable"
            )
            if table_class not in support_sparse_table_class:
                raise ValueError(
                    "support sparse_table_class: ['DownpourSparseTable'], but actual %s"
                    % (table_class)
                )
            table_data.table_class = 'MemorySparseTable'
            table_data.shard_num = config.get('sparse_shard_num', 1000)
            table_data.enable_sparse_table_cache = config.get(
                'sparse_enable_cache', True
            )
            table_data.sparse_table_cache_rate = config.get(
                'sparse_cache_rate', 0.00055
            )
            table_data.sparse_table_cache_file_num = config.get(
                'sparse_cache_file_num', 16
            )
            accessor_class = config.get(
                "sparse_accessor_class", "DownpourCtrAccessor"
            )
            if accessor_class not in support_sparse_accessor_class:
                raise ValueError(
                    "support sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s"
                    % (accessor_class)
                )
            if accessor_class.find("Double") >= 0:
                table_data.accessor.accessor_class = 'CtrDoubleAccessor'
@@ -654,7 +765,8 @@ class DistributedStrategy(object):
            table_data.accessor.embedx_dim = config.get('sparse_embedx_dim', 8)
            table_data.accessor.fea_dim = table_data.accessor.embedx_dim + 3
            table_data.accessor.embedx_threshold = config.get(
                'sparse_embedx_threshold', 10
            )
            if accessor_class == 'DownpourUnitAccessor':
                table_data.accessor.ctr_accessor_param.show_scale = False
@@ -662,23 +774,32 @@ class DistributedStrategy(object):
                table_data.accessor.ctr_accessor_param.show_scale = True
            table_data.accessor.ctr_accessor_param.nonclk_coeff = config.get(
                'sparse_nonclk_coeff', 0.1
            )
            table_data.accessor.ctr_accessor_param.click_coeff = config.get(
                'sparse_click_coeff', 1
            )
            table_data.accessor.ctr_accessor_param.base_threshold = config.get(
                'sparse_base_threshold', 1.5
            )
            table_data.accessor.ctr_accessor_param.delta_threshold = config.get(
                'sparse_delta_threshold', 0.25
            )
            table_data.accessor.ctr_accessor_param.delta_keep_days = config.get(
                'sparse_delta_keep_days', 16
            )
            table_data.accessor.ctr_accessor_param.show_click_decay_rate = (
                config.get('sparse_show_click_decay_rate', 0.98)
            )
            table_data.accessor.ctr_accessor_param.delete_threshold = (
                config.get('sparse_delete_threshold', 0.8)
            )
            table_data.accessor.ctr_accessor_param.delete_after_unseen_days = (
                config.get('sparse_delete_after_unseen_days', 30)
            )
            table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = (
                config.get('sparse_ssd_unseenday_threshold', 1)
            )
            converter = config.get('sparse_converter', "")
            deconverter = config.get('sparse_deconverter', "")
@@ -692,23 +813,33 @@ class DistributedStrategy(object):
            save_data2.converter = converter
            save_data2.deconverter = deconverter

            if (
                accessor_class == 'DownpourCtrAccessor'
                or accessor_class == 'DownpourCtrDoubleAccessor'
            ):
                sparse_optimizer_config(
                    table_data.accessor.embed_sgd_param, config, ''
                )
                sparse_optimizer_config(
                    table_data.accessor.embedx_sgd_param, config, ''
                )
            else:
                sparse_optimizer_config(
                    table_data.accessor.embed_sgd_param, config, 'embed_'
                )
                sparse_optimizer_config(
                    table_data.accessor.embedx_sgd_param, config, 'embedx_'
                )
            add_graph_config(table_data.accessor.graph_sgd_param, config)

        if not configs:
            print("fleet desc config is empty")
        else:
            for table_name in configs:
                if (
                    table_name == 'dense_table'
                    or table_name == 'datanorm_table'
                ):
                    continue
                if type(configs[table_name]) != dict:
                    continue
@@ -744,6 +875,7 @@ class DistributedStrategy(object):
    @property
    def amp_configs(self):
        """
        Set automatic mixed precision training configurations. In general, amp has several configurable
        settings that can be configured through a dict.
@@ -772,7 +904,6 @@ class DistributedStrategy(object):
            Default True. Only takes effect when `use_pure_fp16` is turned on.

        Examples 1:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -783,7 +914,6 @@ class DistributedStrategy(object):
                    "custom_white_list": ['conv2d']}

        Examples 2:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -794,6 +924,7 @@ class DistributedStrategy(object):
                    "init_loss_scaling": 32768,
                    "use_pure_fp16": True
                }

        """
        return get_msg_dict(self.strategy.amp_configs)
@@ -806,11 +937,11 @@ class DistributedStrategy(object):
    @property
    def asp(self):
        """
        Indicating whether we are using automatic sparsity training
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -835,7 +966,6 @@ class DistributedStrategy(object):
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -843,22 +973,24 @@ class DistributedStrategy(object):
                strategy.recompute = True
                # suppose x and y are names of checkpoint tensors for recomputation
                strategy.recompute_configs = {"checkpoints": ["x", "y"]}

        """
        return self.strategy.recompute

    @property
    def sync_nccl_allreduce(self):
        """
        Indicating whether we are using synchronized all reduce in each communication thread
        We note that system overhead is usually lower when sync_nccl_allreduce = True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.sync_nccl_allreduce = True

        """
        return self.strategy.sync_nccl_allreduce
@@ -873,17 +1005,18 @@ class DistributedStrategy(object):
    @property
    def use_hierarchical_allreduce(self):
        """
        Indicating whether we are using hierarchical allreduce in collective communication
        Hierarchical allreduce often does allreduce within a certain node group and then do
        allreduce among the leaders of each group

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.use_hierarchical_allreduce = True

        """
        return self.strategy.use_hierarchical_allreduce
@@ -900,16 +1033,17 @@ class DistributedStrategy(object):
    @property
    def hierarchical_allreduce_inter_nranks(self):
        """
        Number of ranks for low level node groups in hierarchical allreduce
        Default value: number of GPU cards on each single GPU machine

        Example:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.hierarchical_allreduce_inter_nranks = 8

        """
        return self.strategy.hierarchical_allreduce_inter_nranks
@@ -926,17 +1060,18 @@ class DistributedStrategy(object):
    @property
    def sync_batch_norm(self):
        """
        Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.sync_batch_norm = True

        """
        return self.strategy.sync_batch_norm
@@ -952,16 +1087,17 @@ class DistributedStrategy(object):
    @property
    def fuse_all_reduce_ops(self):
        """
        Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
        Default value: True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_all_reduce_ops = False

        """
        return self.strategy.fuse_all_reduce_ops
@@ -976,17 +1112,18 @@ class DistributedStrategy(object):
    @property
    def fuse_grad_size_in_MB(self):
        """
        Specifying the size of gradient to fuse in Mega-Bytes
        Default value: 32

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_size_in_MB = 50

        """
        return self.strategy.fuse_grad_size_in_MB
@@ -1001,6 +1138,7 @@ class DistributedStrategy(object):
    @property
    def last_comm_group_size_MB(self):
        """
        Specifying the size of gradient to fuse in Mega-Bytes when
        the last group of each batch communicates. Making the last group
        small is useful to improve performance.
@@ -1013,6 +1151,7 @@ class DistributedStrategy(object):
                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.last_comm_group_size_MB = 2

        """
        return self.strategy.last_comm_group_size_MB
@@ -1027,18 +1166,19 @@ class DistributedStrategy(object):
    @property
    def find_unused_parameters(self):
        """
        Indicating whether we are using find_unused_parameters to
        find unused parameters in DataParallel.
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.find_unused_parameters = True

        """
        return self.strategy.find_unused_parameters
@@ -1070,17 +1210,18 @@ class DistributedStrategy(object):
    @property
    def nccl_comm_num(self):
        """
        Specifying the number of NCCL communicator
        Default value: 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.nccl_comm_num = 2

        """
        return self.strategy.nccl_comm_num
@@ -1104,6 +1245,7 @@ class DistributedStrategy(object):
    @property
    def recompute_configs(self):
        """
        Set recompute configurations.

        **Note**:
@@ -1120,7 +1262,6 @@ class DistributedStrategy(object):
            specific here should be determined ("-1" is not allowed).

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1137,13 +1278,15 @@ class DistributedStrategy(object):
    @recompute_configs.setter
    @is_strict_auto
    def recompute_configs(self, configs):
        check_configs_key(
            self.strategy.recompute_configs, configs, "checkpoint_configs"
        )
        assign_configs_value(self.strategy.recompute_configs, configs)
    @property
    def sharding(self):
        """
        Indicating whether we are using sharding Optimizer for memory
        optimization. We implement the sharding optimizer following the ZeRO-DP
        idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
@@ -1154,12 +1297,12 @@ class DistributedStrategy(object):
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.sharding = True

        """
        return self.strategy.sharding
@@ -1174,6 +1317,7 @@ class DistributedStrategy(object):
    @property
    def sharding_configs(self):
        """
        Set sharding configurations.

        **Note**:
@@ -1211,7 +1355,6 @@ class DistributedStrategy(object):
        Examples:
            .. code-block:: python

                # sharding-DP, 2 nodes with 8 gpus per node
@@ -1225,23 +1368,25 @@ class DistributedStrategy(object):
                    "dp_degree": 2,
                    "gradient_merge_acc_step": 4,
                }

        """
        return get_msg_dict(self.strategy.sharding_configs)

    @sharding_configs.setter
    @is_strict_auto
    def sharding_configs(self, configs):
        check_configs_key(
            self.strategy.sharding_configs, configs, "sharding_configs"
        )
        assign_configs_value(self.strategy.sharding_configs, configs)
    @property
    def without_graph_optimization(self):
        """
        Run program using Executor other than ParallelExecutor.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1264,14 +1409,18 @@ class DistributedStrategy(object):
    @property
    def _calc_comm_same_stream(self):
        """
        This is based on the raw_program_optimizer program.
        Set whether to use the same stream for calculation and communication when fusing allreduce.
        The default value for the calc_comm_same_stream is False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.calc_comm_same_stream = True

        """
        return self.strategy.calc_comm_same_stream
@@ -1288,14 +1437,18 @@ class DistributedStrategy(object):
    @property
    def fuse_grad_merge(self):
        """
        Set whether to fuse the grad for gradient merge.
        Note: this flag only affects the gradient merge under pipeline mode
        The default value for the fuse_grad_merge is False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_param_grad = True

        """
        return self.strategy.fuse_grad_merge
@@ -1310,12 +1463,17 @@ class DistributedStrategy(object):
    @property
    def fuse_grad_size_in_num(self):
        """
        This is based on the raw_program_optimizer program and sets the number of gradients fused per allreduce op.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_size_in_num = 2

        """
        return self.strategy.fuse_grad_size_in_num
@@ -1332,13 +1490,13 @@ class DistributedStrategy(object):
    @property
    def pipeline(self):
        """
        Indicating whether we are using pipeline parallelism for distributed training.
        The current implementation mainly focuses on pipeline parallelism within a single GPU machine and
        data parallelism across GPU machines. The pipeline information is indicated through
        device_guard information in user-defined program.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1383,6 +1541,7 @@ class DistributedStrategy(object):
    @property
    def pipeline_configs(self):
        """
        Set pipeline parallelism configurations. In pipeline parallelism,
        different parts of neural networks are running on different GPUs.
        There are Tensor queue buffers between each pair of neighboring GPUs
@@ -1398,7 +1557,6 @@ class DistributedStrategy(object):
        **micro_batch_size**: the number of small batches in each user defined batch

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1413,17 +1571,18 @@ class DistributedStrategy(object):
    @pipeline_configs.setter
    @is_strict_auto
    def pipeline_configs(self, configs):
        check_configs_key(
            self.strategy.pipeline_configs, configs, "pipeline_configs"
        )
        assign_configs_value(self.strategy.pipeline_configs, configs)
    @property
    def tensor_parallel(self):
        """
        Indicating whether we are using tensor parallel for distributed training.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1444,16 +1603,18 @@ class DistributedStrategy(object):
    @property
    def tensor_parallel_configs(self):
        """
        Set tensor_parallel configurations.

        **Notes**:
            **Detailed arguments for tensor_parallel_configs**

            **tensor_parallel_degree**: degree of tensor parallel

            **tensor_init_seed**: parameter initialization random seed

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
@@ -1468,54 +1629,62 @@ class DistributedStrategy(object):
    @tensor_parallel_configs.setter
    @is_strict_auto
    def tensor_parallel_configs(self, configs):
        check_configs_key(
            self.strategy.tensor_parallel_configs,
            configs,
            "tensor_parallel_configs",
        )
        assign_configs_value(self.strategy.tensor_parallel_configs, configs)
    @property
    def hybrid_configs(self):
        """
        Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
        needs to meet the following relationships

        total_number_GPUs = dp_degree * mp_degree * pp_degree

        **Note**:
            **dp_degree(int)**: set number of GPUs in a data parallel group. Default -1.
                                This value should be an integer greater than 0.
                                If it is not set, or set to -1, its value will be inferred
                                based on the total number of cards.

            **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1

            **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.hybrid_configs = {
                    "dp_degree": 1,
                    "mp_degree": 2,
                    "pp_degree": 1}

        """
        return get_msg_dict(self.strategy.hybrid_configs)

    @hybrid_configs.setter
    def hybrid_configs(self, configs):
        check_configs_key(
            self.strategy.hybrid_configs, configs, "hybrid_configs"
        )
        assign_configs_value(self.strategy.hybrid_configs, configs)
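    # Illustrative example (not part of the original diff): the three degrees
    # must multiply to the total GPU count. On 8 GPUs, for instance,
    #
    #     strategy.hybrid_configs = {"dp_degree": 2, "mp_degree": 2, "pp_degree": 2}
    #
    # satisfies 2 * 2 * 2 == 8; setting dp_degree to -1 instead lets it be
    # inferred from the total number of cards, as described in the docstring.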
@property @property
def localsgd(self): def localsgd(self):
""" """
Indicating whether we are using Local SGD training. Default Value: False Indicating whether we are using Local SGD training. Default Value: False
For more details, please refer to For more details, please refer to
`Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_. `Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1536,6 +1705,7 @@ class DistributedStrategy(object): ...@@ -1536,6 +1705,7 @@ class DistributedStrategy(object):
@property @property
def localsgd_configs(self): def localsgd_configs(self):
""" """
Set LocalSGD training configurations. LocalSGD has a configurable Set LocalSGD training configurations. LocalSGD has a configurable
setting that can be configured through a dict. setting that can be configured through a dict.
...@@ -1544,7 +1714,6 @@ class DistributedStrategy(object): ...@@ -1544,7 +1714,6 @@ class DistributedStrategy(object):
begin_step(int): The step at which to begin LocalSGD training. Default 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1552,6 +1721,7 @@ class DistributedStrategy(object): ...@@ -1552,6 +1721,7 @@ class DistributedStrategy(object):
strategy.localsgd = True strategy.localsgd = True
strategy.localsgd_configs = {"k_steps": 4, strategy.localsgd_configs = {"k_steps": 4,
"begin_step": 30} "begin_step": 30}
""" """
return get_msg_dict(self.strategy.localsgd_configs) return get_msg_dict(self.strategy.localsgd_configs)
...@@ -1559,20 +1729,20 @@ class DistributedStrategy(object): ...@@ -1559,20 +1729,20 @@ class DistributedStrategy(object):
@localsgd_configs.setter @localsgd_configs.setter
@is_strict_auto @is_strict_auto
def localsgd_configs(self, configs): def localsgd_configs(self, configs):
check_configs_key(self.strategy.localsgd_configs, configs, check_configs_key(
"localsgd_configs") self.strategy.localsgd_configs, configs, "localsgd_configs"
)
assign_configs_value(self.strategy.localsgd_configs, configs) assign_configs_value(self.strategy.localsgd_configs, configs)
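# A rough sketch (one plausible reading, not the exact Paddle kernel) of the
# schedule that k_steps and begin_step control: from begin_step onwards each
# worker trains locally and parameters are synchronized every k_steps steps.
def _local_sgd_sync_points(num_steps, k_steps=4, begin_step=30):
    points = []
    for step in range(1, num_steps + 1):
        if step >= begin_step and (step - begin_step) % k_steps == 0:
            points.append(step)  # parameters would be averaged across workers here
    return points

assert _local_sgd_sync_points(40)[:3] == [30, 34, 38]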
@property @property
def adaptive_localsgd(self): def adaptive_localsgd(self):
""" """
Indicating whether we are using Adaptive Local SGD training. Default Value: False Indicating whether we are using Adaptive Local SGD training. Default Value: False
For more details, please refer to `Adaptive Communication Strategies to Achieve For more details, please refer to `Adaptive Communication Strategies to Achieve
the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_. the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1593,6 +1763,7 @@ class DistributedStrategy(object): ...@@ -1593,6 +1763,7 @@ class DistributedStrategy(object):
@property @property
def adaptive_localsgd_configs(self): def adaptive_localsgd_configs(self):
""" """
Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable
setting that can be configured through a dict. setting that can be configured through a dict.
...@@ -1600,10 +1771,10 @@ class DistributedStrategy(object): ...@@ -1600,10 +1771,10 @@ class DistributedStrategy(object):
init_k_steps(int): The initial number of local steps before Adaptive LocalSGD
takes effect; the adaptive algorithm then adjusts this value automatically.
Default 1.
begin_step(int): The step at which to begin Adaptive LocalSGD training. Default 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1611,6 +1782,7 @@ class DistributedStrategy(object): ...@@ -1611,6 +1782,7 @@ class DistributedStrategy(object):
strategy.adaptive_localsgd = True strategy.adaptive_localsgd = True
strategy.adaptive_localsgd_configs = {"init_k_steps": 1, strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
"begin_step": 30} "begin_step": 30}
""" """
return get_msg_dict(self.strategy.adaptive_localsgd_configs) return get_msg_dict(self.strategy.adaptive_localsgd_configs)
...@@ -1618,20 +1790,23 @@ class DistributedStrategy(object): ...@@ -1618,20 +1790,23 @@ class DistributedStrategy(object):
@adaptive_localsgd_configs.setter @adaptive_localsgd_configs.setter
@is_strict_auto @is_strict_auto
def adaptive_localsgd_configs(self, configs): def adaptive_localsgd_configs(self, configs):
check_configs_key(self.strategy.adaptive_localsgd_configs, configs, check_configs_key(
"adaptive_localsgd_configs") self.strategy.adaptive_localsgd_configs,
configs,
"adaptive_localsgd_configs",
)
assign_configs_value(self.strategy.adaptive_localsgd_configs, configs) assign_configs_value(self.strategy.adaptive_localsgd_configs, configs)
@property @property
def dgc(self): def dgc(self):
""" """
Indicating whether we are using Deep Gradient Compression training. For more details, please refer to Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
[Deep Gradient Compression](https://arxiv.org/abs/1712.01887). [Deep Gradient Compression](https://arxiv.org/abs/1712.01887).
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1652,6 +1827,7 @@ class DistributedStrategy(object): ...@@ -1652,6 +1827,7 @@ class DistributedStrategy(object):
@property @property
def dgc_configs(self): def dgc_configs(self):
r""" r"""
Set Deep Gradient Compression training configurations. In general, DGC has several configurable
settings that can be configured through a dict.
...@@ -1668,13 +1844,13 @@ class DistributedStrategy(object): ...@@ -1668,13 +1844,13 @@ class DistributedStrategy(object):
element will be transmitted. element will be transmitted.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.dgc = True strategy.dgc = True
strategy.dgc_configs = {"rampup_begin_step": 1252} strategy.dgc_configs = {"rampup_begin_step": 1252}
""" """
return get_msg_dict(self.strategy.dgc_configs) return get_msg_dict(self.strategy.dgc_configs)
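# A pure-Python sketch of the top-k gradient sparsification idea behind DGC's
# sparsity setting: only the largest-magnitude fraction of gradient elements is
# transmitted each step. Illustrative only, not the actual DGC operator.
def _topk_sparsify(grad, keep_ratio):
    k = max(1, int(len(grad) * keep_ratio))
    threshold = sorted((abs(g) for g in grad), reverse=True)[k - 1]
    return [g if abs(g) >= threshold else 0.0 for g in grad]

print(_topk_sparsify([0.1, -2.0, 0.03, 0.5], keep_ratio=0.5))  # [0.0, -2.0, 0.0, 0.5]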
...@@ -1687,14 +1863,15 @@ class DistributedStrategy(object): ...@@ -1687,14 +1863,15 @@ class DistributedStrategy(object):
@property @property
def fp16_allreduce(self): def fp16_allreduce(self):
""" """
Indicating whether we are using fp16 gradient allreduce training.
Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True # by default this is false strategy.fp16_allreduce = True # by default this is false
...@@ -1711,6 +1888,7 @@ class DistributedStrategy(object): ...@@ -1711,6 +1888,7 @@ class DistributedStrategy(object):
@property @property
def gradient_merge(self): def gradient_merge(self):
""" """
Gradient Merge, also known as Gradient Accumulation,
is a strategy for large batch training. With this strategy,
model parameters are not updated until the gradients of a user-defined number of steps have been accumulated.
...@@ -1721,13 +1899,13 @@ class DistributedStrategy(object): ...@@ -1721,13 +1899,13 @@ class DistributedStrategy(object):
to model parameters. to model parameters.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
""" """
return self.strategy.gradient_merge return self.strategy.gradient_merge
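# A minimal sketch of the update rule gradient merge implies, assuming
# k_steps=4 and avg=True as in the example above; `lr` and the scalar
# "parameter" are illustrative placeholders.
def _accumulate_and_step(param, grads, k_steps=4, avg=True, lr=0.1):
    acc = 0.0
    for i, g in enumerate(grads, start=1):
        acc += g
        if i % k_steps == 0:      # parameters change only every k_steps batches
            acc = acc / k_steps if avg else acc
            param -= lr * acc
            acc = 0.0
    return param

print(_accumulate_and_step(1.0, [0.5] * 8))  # two updates of 0.05 each, ~0.9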
...@@ -1742,6 +1920,7 @@ class DistributedStrategy(object): ...@@ -1742,6 +1920,7 @@ class DistributedStrategy(object):
@property @property
def gradient_merge_configs(self): def gradient_merge_configs(self):
""" """
The key-value configurations for gradient merge.
**Note**: **Note**:
...@@ -1750,26 +1929,28 @@ class DistributedStrategy(object): ...@@ -1750,26 +1929,28 @@ class DistributedStrategy(object):
avg(bool): whether to average the accumulated gradients of the mini-batches. The default value is `True`.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
""" """
return get_msg_dict(self.strategy.gradient_merge_configs) return get_msg_dict(self.strategy.gradient_merge_configs)
@gradient_merge_configs.setter @gradient_merge_configs.setter
@is_strict_auto @is_strict_auto
def gradient_merge_configs(self, configs): def gradient_merge_configs(self, configs):
check_configs_key(self.strategy.gradient_merge_configs, configs, check_configs_key(
"gradient_configs") self.strategy.gradient_merge_configs, configs, "gradient_configs"
)
assign_configs_value(self.strategy.gradient_merge_configs, configs) assign_configs_value(self.strategy.gradient_merge_configs, configs)
@property @property
def lars(self): def lars(self):
""" """
Set lars configurations. lars is used to deal with the convergence problems when the global Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to batch size is larger than 8k. For more details, please refer to
[Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888). [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
...@@ -1777,12 +1958,12 @@ class DistributedStrategy(object): ...@@ -1777,12 +1958,12 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.lars = True # by default this is false strategy.lars = True # by default this is false
""" """
return self.strategy.lars return self.strategy.lars
...@@ -1797,6 +1978,7 @@ class DistributedStrategy(object): ...@@ -1797,6 +1978,7 @@ class DistributedStrategy(object):
@property @property
def lars_configs(self): def lars_configs(self):
""" """
Set Lars training configurations. Set Lars training configurations.
**Notes**: **Notes**:
...@@ -1808,7 +1990,6 @@ class DistributedStrategy(object): ...@@ -1808,7 +1990,6 @@ class DistributedStrategy(object):
will be excluded from weight decay in the LARS formula.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1820,6 +2001,7 @@ class DistributedStrategy(object): ...@@ -1820,6 +2001,7 @@ class DistributedStrategy(object):
"epsilon": 0, "epsilon": 0,
"exclude_from_weight_decay": ['batch_norm', '.b_0'] "exclude_from_weight_decay": ['batch_norm', '.b_0']
} }
""" """
return get_msg_dict(self.strategy.lars_configs) return get_msg_dict(self.strategy.lars_configs)
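# A sketch of the layer-wise learning-rate scaling from the LARS paper linked
# above, showing where lars_coeff, lars_weight_decay and epsilon enter; plain
# floats stand in for the parameter and gradient norms of one layer.
def _lars_local_lr(base_lr, w_norm, g_norm, lars_coeff=0.001,
                   lars_weight_decay=0.0005, epsilon=0.0):
    trust = lars_coeff * w_norm / (g_norm + lars_weight_decay * w_norm + epsilon)
    return base_lr * trust

print(_lars_local_lr(1.0, w_norm=10.0, g_norm=0.5))  # ~0.0198, a per-layer lr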
...@@ -1832,6 +2014,7 @@ class DistributedStrategy(object): ...@@ -1832,6 +2014,7 @@ class DistributedStrategy(object):
@property @property
def lamb(self): def lamb(self):
""" """
Set lamb configurations. lamb is used to deal with the convergence problems for large Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-based models like BERT. For more details,
please refer to please refer to
...@@ -1840,12 +2023,12 @@ class DistributedStrategy(object): ...@@ -1840,12 +2023,12 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.lamb = True # by default this is false strategy.lamb = True # by default this is false
""" """
return self.strategy.lamb return self.strategy.lamb
...@@ -1861,6 +2044,7 @@ class DistributedStrategy(object): ...@@ -1861,6 +2044,7 @@ class DistributedStrategy(object):
@property @property
def lamb_configs(self): def lamb_configs(self):
""" """
Set LAMB training configurations.
**Notes**: **Notes**:
...@@ -1869,7 +2053,6 @@ class DistributedStrategy(object): ...@@ -1869,7 +2053,6 @@ class DistributedStrategy(object):
will be excluded from weight decay in the LAMB formula.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -1879,6 +2062,7 @@ class DistributedStrategy(object): ...@@ -1879,6 +2062,7 @@ class DistributedStrategy(object):
'lamb_weight_decay': 0.01, 'lamb_weight_decay': 0.01,
'exclude_from_weight_decay': [], 'exclude_from_weight_decay': [],
} }
""" """
return get_msg_dict(self.strategy.lamb_configs) return get_msg_dict(self.strategy.lamb_configs)
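# A sketch of the layer-wise trust ratio at the heart of LAMB (see the paper
# referenced above): the Adam-style update, plus lamb_weight_decay * w for
# parameters not in exclude_from_weight_decay, is rescaled by ||w|| / ||update||.
def _lamb_trust_ratio(w_norm, update_norm):
    if w_norm == 0.0 or update_norm == 0.0:
        return 1.0
    return w_norm / update_norm

# a layer with large weights but a small update gets its step scaled up
print(_lamb_trust_ratio(w_norm=4.0, update_norm=0.25))  # 16.0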
...@@ -1891,8 +2075,10 @@ class DistributedStrategy(object): ...@@ -1891,8 +2075,10 @@ class DistributedStrategy(object):
@property @property
def elastic(self): def elastic(self):
""" """
Indicating whether we want to do the current distributed training on clusters with elastic resources.
Currently, this configuration is not valid.
""" """
return self.strategy.elastic return self.strategy.elastic
...@@ -1907,6 +2093,7 @@ class DistributedStrategy(object): ...@@ -1907,6 +2093,7 @@ class DistributedStrategy(object):
@property @property
def auto(self): def auto(self):
""" """
Indicating whether we are using the auto-parallel configuration.
This feature is currently experimental. Currently,
auto-parallelism can be used only when a user does not set any other auto-parallelism can be used only when a user does not set any other
...@@ -1915,7 +2102,6 @@ class DistributedStrategy(object): ...@@ -1915,7 +2102,6 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -1929,6 +2115,7 @@ class DistributedStrategy(object): ...@@ -1929,6 +2115,7 @@ class DistributedStrategy(object):
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.auto return self.strategy.auto
...@@ -1942,6 +2129,7 @@ class DistributedStrategy(object): ...@@ -1942,6 +2129,7 @@ class DistributedStrategy(object):
@property @property
def semi_auto(self): def semi_auto(self):
""" """
Indicating whether we are using the semi-auto parallel function.
This feature is currently experimental. Currently,
auto-parallelism can be used only when a user does not set any other auto-parallelism can be used only when a user does not set any other
...@@ -1950,7 +2138,6 @@ class DistributedStrategy(object): ...@@ -1950,7 +2138,6 @@ class DistributedStrategy(object):
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -1964,6 +2151,7 @@ class DistributedStrategy(object): ...@@ -1964,6 +2151,7 @@ class DistributedStrategy(object):
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.semi_auto return self.strategy.semi_auto
...@@ -1977,16 +2165,21 @@ class DistributedStrategy(object): ...@@ -1977,16 +2165,21 @@ class DistributedStrategy(object):
@property @property
def auto_search(self): def auto_search(self):
""" """
Indicating whether we are using the auto-search parallel function.
For details, please refer to the following code example.
Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.auto_search = True strategy.auto_search = True
""" """
return self.strategy.auto_search return self.strategy.auto_search
...@@ -2000,15 +2193,20 @@ class DistributedStrategy(object): ...@@ -2000,15 +2193,20 @@ class DistributedStrategy(object):
@property @property
def split_data(self): def split_data(self):
""" """
Indicating whether we split the data. If True, the data will be split.
Default Value: True
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.split_data = True strategy.split_data = True
""" """
return self.strategy.split_data return self.strategy.split_data
...@@ -2022,8 +2220,10 @@ class DistributedStrategy(object): ...@@ -2022,8 +2220,10 @@ class DistributedStrategy(object):
@property @property
def qat(self): def qat(self):
""" """
Indicating whether we are using quantization-aware training (QAT).
Default Value: False
""" """
return self.strategy.qat return self.strategy.qat
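# A minimal usage sketch: enabling the switch follows the same pattern as the
# other strategy flags; the quantization details are set via qat_configs below.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.qat = True  # by default this is false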
...@@ -2037,6 +2237,7 @@ class DistributedStrategy(object): ...@@ -2037,6 +2237,7 @@ class DistributedStrategy(object):
@property @property
def qat_configs(self): def qat_configs(self):
""" """
Set quantization training configurations. In general, QAT has several configurable
settings that can be configured through a dict.
...@@ -2053,10 +2254,10 @@ class DistributedStrategy(object): ...@@ -2053,10 +2254,10 @@ class DistributedStrategy(object):
algo(str): Other quantization training algorithm. algo(str): Other quantization training algorithm.
Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.qat = True strategy.qat = True
strategy.qat_configs = { strategy.qat_configs = {
...@@ -2076,13 +2277,13 @@ class DistributedStrategy(object): ...@@ -2076,13 +2277,13 @@ class DistributedStrategy(object):
@property @property
def heter_ccl_mode(self): def heter_ccl_mode(self):
""" """
Indicating whether we are using heter_ccl_mode for model training. Indicating whether we are using heter_ccl_mode for model training.
This feature is currently an experimental feature. Currently, This feature is currently an experimental feature. Currently,
heter_ccl_mode can be used only for dataparallel with dygraph mode. heter_ccl_mode can be used only for dataparallel with dygraph mode.
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -2094,6 +2295,7 @@ class DistributedStrategy(object): ...@@ -2094,6 +2295,7 @@ class DistributedStrategy(object):
# for initialize parallel env, only need to call # for initialize parallel env, only need to call
paddle.distributed.init_parallel_env() paddle.distributed.init_parallel_env()
# then the heterogeneous context will be created.
""" """
return self.strategy.heter_ccl_mode return self.strategy.heter_ccl_mode
...@@ -2107,6 +2309,7 @@ class DistributedStrategy(object): ...@@ -2107,6 +2309,7 @@ class DistributedStrategy(object):
@property @property
def cudnn_exhaustive_search(self): def cudnn_exhaustive_search(self):
""" """
Indicating whether to use the exhaustive search method to choose convolution algorithms.
Exhaustive search attempts all cuDNN algorithms to choose the fastest one.
This method is time-consuming; the chosen algorithm will be cached for the given layer specifications.
...@@ -2114,17 +2317,18 @@ class DistributedStrategy(object): ...@@ -2114,17 +2317,18 @@ class DistributedStrategy(object):
Default Value: True Default Value: True
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False strategy.cudnn_exhaustive_search = False
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.cudnn_exhaustive_search return self.strategy.cudnn_exhaustive_search
...@@ -2141,6 +2345,7 @@ class DistributedStrategy(object): ...@@ -2141,6 +2345,7 @@ class DistributedStrategy(object):
@property @property
def conv_workspace_size_limit(self): def conv_workspace_size_limit(self):
""" """
The workspace limit size in MB for choosing cuDNN convolution algorithms.
The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit.
Usually, a larger workspace size makes it possible to choose faster algorithms,
...@@ -2148,12 +2353,12 @@ class DistributedStrategy(object): ...@@ -2148,12 +2353,12 @@ class DistributedStrategy(object):
Default Value: 4000 Default Value: 4000
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1024 strategy.conv_workspace_size_limit = 1024
...@@ -2176,17 +2381,18 @@ class DistributedStrategy(object): ...@@ -2176,17 +2381,18 @@ class DistributedStrategy(object):
@property @property
def cudnn_batchnorm_spatial_persistent(self): def cudnn_batchnorm_spatial_persistent(self):
""" """
Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for batch normalization.
This is only useful with cuDNN.
Default Value: True Default Value: True
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static() paddle.enable_static()
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = True strategy.cudnn_batchnorm_spatial_persistent = True
...@@ -2244,7 +2450,8 @@ class DistributedStrategy(object): ...@@ -2244,7 +2450,8 @@ class DistributedStrategy(object):
h1_format = " " + "|{{:^{}s}}|\n".format(length) h1_format = " " + "|{{:^{}s}}|\n".format(length)
h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format( h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(
max_k, " " * spacing, max_v) max_k, " " * spacing, max_v
)
border = " +" + "".join(["="] * length) + "+" border = " +" + "".join(["="] * length) + "+"
line = " +" + "".join(["-"] * length) + "+" line = " +" + "".join(["-"] * length) + "+"
...@@ -2269,37 +2476,48 @@ class DistributedStrategy(object): ...@@ -2269,37 +2476,48 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name): if getattr(self.strategy, f.name):
draws += border + "\n" draws += border + "\n"
draws += h1_format.format( draws += h1_format.format(
"{}=True <-> {}_configs".format(f.name, f.name)) "{}=True <-> {}_configs".format(f.name, f.name)
)
draws += line + "\n" draws += line + "\n"
my_configs = getattr(self.strategy, my_configs = getattr(
f.name + "_configs") self.strategy, f.name + "_configs"
)
config_fields = my_configs.DESCRIPTOR.fields config_fields = my_configs.DESCRIPTOR.fields
for ff in config_fields: for ff in config_fields:
if isinstance( if isinstance(
getattr(my_configs, getattr(my_configs, ff.name),
ff.name), google.protobuf.pyext. google.protobuf.pyext._message.RepeatedScalarContainer,
_message.RepeatedScalarContainer): ):
values = getattr(my_configs, ff.name) values = getattr(my_configs, ff.name)
for i, v in enumerate(values): for i, v in enumerate(values):
if i == 0: if i == 0:
draws += h2_format.format( draws += h2_format.format(
ff.name, str(v)) ff.name, str(v)
)
else: else:
draws += h2_format.format( draws += h2_format.format(
"", str(v)) "", str(v)
)
else: else:
draws += h2_format.format( draws += h2_format.format(
ff.name, ff.name,
str(getattr(my_configs, ff.name))) str(getattr(my_configs, ff.name)),
)
else: else:
env_draws += h2_format.format( env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name))) f.name, str(getattr(self.strategy, f.name))
)
else: else:
env_draws += h2_format.format( env_draws += h2_format.format(
f.name, str(getattr(self.strategy, f.name))) f.name, str(getattr(self.strategy, f.name))
)
result_res = draws + border + "\n" + h1_format.format( result_res = (
"Environment Flags, Communication Flags") draws
+ border
+ "\n"
+ h1_format.format("Environment Flags, Communication Flags")
)
result_res += env_draws result_res += env_draws
build_strategy_str = border + "\n" build_strategy_str = border + "\n"
...@@ -2309,7 +2527,8 @@ class DistributedStrategy(object): ...@@ -2309,7 +2527,8 @@ class DistributedStrategy(object):
fields = self.strategy.build_strategy.DESCRIPTOR.fields fields = self.strategy.build_strategy.DESCRIPTOR.fields
for f in fields: for f in fields:
build_strategy_str += h2_format.format( build_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.build_strategy, f.name))) f.name, str(getattr(self.strategy.build_strategy, f.name))
)
build_strategy_str += border + "\n" build_strategy_str += border + "\n"
execution_strategy_str = h1_format.format("Execution Strategy") execution_strategy_str = h1_format.format("Execution Strategy")
...@@ -2318,7 +2537,8 @@ class DistributedStrategy(object): ...@@ -2318,7 +2537,8 @@ class DistributedStrategy(object):
fields = self.strategy.execution_strategy.DESCRIPTOR.fields fields = self.strategy.execution_strategy.DESCRIPTOR.fields
for f in fields: for f in fields:
execution_strategy_str += h2_format.format( execution_strategy_str += h2_format.format(
f.name, str(getattr(self.strategy.execution_strategy, f.name))) f.name, str(getattr(self.strategy.execution_strategy, f.name))
)
execution_strategy_str += border + "\n" execution_strategy_str += border + "\n"
result_res += build_strategy_str + execution_strategy_str result_res += build_strategy_str + execution_strategy_str
......
...@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None ...@@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None
class ParallelMode(object): class ParallelMode(object):
""" """
These are all the parallel modes currently supported:
- DATA_PARALLEL: Distribute the input data to different devices.
- TENSOR_PARALLEL: Shard tensors in the network across different devices.
- PIPELINE_PARALLEL: Place different layers of the network on different devices.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -43,6 +44,7 @@ class ParallelMode(object): ...@@ -43,6 +44,7 @@ class ParallelMode(object):
print(parallel_mode.DATA_PARALLEL) # 0 print(parallel_mode.DATA_PARALLEL) # 0
""" """
DATA_PARALLEL = 0 DATA_PARALLEL = 0
TENSOR_PARALLEL = 1 TENSOR_PARALLEL = 1
PIPELINE_PARALLEL = 2 PIPELINE_PARALLEL = 2
...@@ -50,14 +52,16 @@ class ParallelMode(object): ...@@ -50,14 +52,16 @@ class ParallelMode(object):
class CommunicateTopology(object): class CommunicateTopology(object):
def __init__(
def __init__(self, self,
hybrid_group_names=["data", "pipe", "sharding", "model"], hybrid_group_names=["data", "pipe", "sharding", "model"],
dims=[1, 1, 1, 1]): dims=[1, 1, 1, 1],
):
self._parallel_names = hybrid_group_names self._parallel_names = hybrid_group_names
self._dims = dims self._dims = dims
self.coordinate = collections.namedtuple('Coordinate', self.coordinate = collections.namedtuple(
self._parallel_names) 'Coordinate', self._parallel_names
)
self._world_size = reduce(lambda x, y: x * y, self._dims) self._world_size = reduce(lambda x, y: x * y, self._dims)
ranges = [range(d) for d in self._dims] ranges = [range(d) for d in self._dims]
...@@ -65,7 +69,8 @@ class CommunicateTopology(object): ...@@ -65,7 +69,8 @@ class CommunicateTopology(object):
self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate)))) self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
self._rank2coord = dict( self._rank2coord = dict(
zip(self._coord2rank.values(), self._coord2rank.keys())) zip(self._coord2rank.values(), self._coord2rank.keys())
)
def get_hybrid_group_names(self): def get_hybrid_group_names(self):
return self._parallel_names return self._parallel_names
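# A standalone sketch of the coordinate <-> rank bookkeeping built in
# __init__ above: every point of the cartesian product of the per-axis ranges
# gets a consecutive rank. This mirrors the logic for illustration only.
import collections
import itertools
from functools import reduce

names, dims = ["data", "pipe", "sharding", "model"], [2, 1, 1, 2]
Coordinate = collections.namedtuple("Coordinate", names)
coords = [Coordinate(*c) for c in itertools.product(*[range(d) for d in dims])]
coord2rank = {c: r for r, c in enumerate(coords)}

assert reduce(lambda x, y: x * y, dims) == len(coords) == 4
print(coord2rank[Coordinate(data=1, pipe=0, sharding=0, model=1)])  # rank 3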
...@@ -90,7 +95,8 @@ class CommunicateTopology(object): ...@@ -90,7 +95,8 @@ class CommunicateTopology(object):
def get_axis_list(self, axis_name, index): def get_axis_list(self, axis_name, index):
axis = self._parallel_names.index(axis_name) axis = self._parallel_names.index(axis_name)
ranks = [ ranks = [
self._coord2rank[coord] for coord in self._coord2rank.keys() self._coord2rank[coord]
for coord in self._coord2rank.keys()
if coord[axis] == index if coord[axis] == index
] ]
ranks.sort() ranks.sort()
...@@ -132,7 +138,6 @@ class CommunicateTopology(object): ...@@ -132,7 +138,6 @@ class CommunicateTopology(object):
class HybridCommunicateGroup(object): class HybridCommunicateGroup(object):
def __init__(self, topology): def __init__(self, topology):
self.nranks = paddle.distributed.get_world_size() self.nranks = paddle.distributed.get_world_size()
self.global_rank = paddle.distributed.get_rank() self.global_rank = paddle.distributed.get_rank()
...@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object): ...@@ -148,10 +153,16 @@ class HybridCommunicateGroup(object):
self._sharding_parallel_id = self._get_sharding_parallel_id() self._sharding_parallel_id = self._get_sharding_parallel_id()
self.stage_id = self._get_pipe_parallel_id() self.stage_id = self._get_pipe_parallel_id()
assert self._check_vaild_topo( assert self._check_vaild_topo(), (
), "Here is an unreasonable topogy setting. world_size: {}, but" \ "Here is an unreasonable topogy setting. world_size: {}, but"
"mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(self.nranks, "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(
self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree) self.nranks,
self._mp_degree,
self._sharding_degree,
self._pp_degree,
self._dp_degree,
)
)
# create comm group for data parallel # create comm group for data parallel
self._dp_group, self._dp_comm_group = self._set_comm_group("data") self._dp_group, self._dp_comm_group = self._set_comm_group("data")
...@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object): ...@@ -164,26 +175,43 @@ class HybridCommunicateGroup(object):
# create comm group for sharding parallel # create comm group for sharding parallel
self._sharding_group, self._sharding_comm_group = self._set_comm_group( self._sharding_group, self._sharding_comm_group = self._set_comm_group(
"sharding") "sharding"
)
# create global group for check inf_nan / clip global norm # create global group for check inf_nan / clip global norm
self._check_group, self._check_comm_group = self._set_check_group( self._check_group, self._check_comm_group = self._set_check_group(
"data") "data"
)
# create p2p group # create p2p group
self.is_first_stage = (self.stage_id == 0) self.is_first_stage = self.stage_id == 0
self.is_last_stage = (self.stage_id == (self._pp_degree - 1)) self.is_last_stage = self.stage_id == (self._pp_degree - 1)
# create p2p_groups # create p2p_groups
if self._pp_degree > 1: if self._pp_degree > 1:
self._set_p2p_group() self._set_p2p_group()
debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \ debug_str = (
"sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree, "HybridParallelInfo: rank_id: %d, mp_degree: %d, "
self._sharding_degree, self._pp_degree, self._dp_degree) "sharding_degree: %d, pp_degree: %d, dp_degree: %d"
debug_str += ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" % ( % (
self._mp_group, self._sharding_group, self._pp_group, self.global_rank,
self._dp_group, self._check_group) self._mp_degree,
self._sharding_degree,
self._pp_degree,
self._dp_degree,
)
)
debug_str += (
", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s"
% (
self._mp_group,
self._sharding_group,
self._pp_group,
self._dp_group,
self._check_group,
)
)
logger.info(debug_str) logger.info(debug_str)
global _HYBRID_PARALLEL_GROUP global _HYBRID_PARALLEL_GROUP
...@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object): ...@@ -195,7 +223,12 @@ class HybridCommunicateGroup(object):
# adding its parallel logic within that parallelism # adding its parallel logic within that parallelism
# when use sharding alone, it should have its own parallelism for its parallel logic # when use sharding alone, it should have its own parallelism for its parallel logic
# TODO modify 3 others parallel to support sharding # TODO modify 3 others parallel to support sharding
if self._mp_degree == 1 and self._pp_degree == 1 and self._dp_degree == 1 and self._sharding_degree > 1: if (
self._mp_degree == 1
and self._pp_degree == 1
and self._dp_degree == 1
and self._sharding_degree > 1
):
return ParallelMode.SHARDING_PARALLEL return ParallelMode.SHARDING_PARALLEL
elif self._mp_degree == 1 and self._pp_degree == 1: elif self._mp_degree == 1 and self._pp_degree == 1:
return ParallelMode.DATA_PARALLEL return ParallelMode.DATA_PARALLEL
...@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object): ...@@ -206,7 +239,13 @@ class HybridCommunicateGroup(object):
return ParallelMode.PIPELINE_PARALLEL return ParallelMode.PIPELINE_PARALLEL
def _check_vaild_topo(self): def _check_vaild_topo(self):
return self._dp_degree * self._mp_degree * self._pp_degree * self._sharding_degree == self.nranks return (
self._dp_degree
* self._mp_degree
* self._pp_degree
* self._sharding_degree
== self.nranks
)
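# The check above in isolation: the four parallel degrees must tile the world
# size exactly. A standalone restatement with example numbers for clarity.
def _degrees_match_world_size(dp, mp, pp, sharding, nranks):
    return dp * mp * pp * sharding == nranks

assert _degrees_match_world_size(dp=4, mp=2, pp=1, sharding=1, nranks=8)
assert not _degrees_match_world_size(dp=4, mp=2, pp=2, sharding=1, nranks=8)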
def _set_comm_group(self, parallel_method="data"): def _set_comm_group(self, parallel_method="data"):
parallel_group = [] parallel_group = []
...@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object): ...@@ -268,14 +307,16 @@ class HybridCommunicateGroup(object):
self.prev_rank = prev_rank self.prev_rank = prev_rank
next_group = paddle.distributed.new_group( next_group = paddle.distributed.new_group(
ranks=[curr_rank, next_rank]) ranks=[curr_rank, next_rank]
)
if self.global_rank == curr_rank: if self.global_rank == curr_rank:
self.send_next_group = next_group self.send_next_group = next_group
elif self.global_rank == next_rank: elif self.global_rank == next_rank:
self.recv_prev_group = next_group self.recv_prev_group = next_group
prev_group = paddle.distributed.new_group( prev_group = paddle.distributed.new_group(
ranks=[prev_rank, curr_rank]) ranks=[prev_rank, curr_rank]
)
if self.global_rank == curr_rank: if self.global_rank == curr_rank:
self.send_prev_group = prev_group self.send_prev_group = prev_group
...@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object): ...@@ -339,7 +380,12 @@ class HybridCommunicateGroup(object):
return self._pp_comm_group return self._pp_comm_group
def get_p2p_groups(self): def get_p2p_groups(self):
return self.send_next_group, self.send_prev_group, self.recv_next_group, self.recv_prev_group return (
self.send_next_group,
self.send_prev_group,
self.recv_next_group,
self.recv_prev_group,
)
# sharding parallel message: # sharding parallel message:
def _get_sharding_parallel_id(self): def _get_sharding_parallel_id(self):
...@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object): ...@@ -363,23 +409,25 @@ class HybridCommunicateGroup(object):
return self._check_comm_group return self._check_comm_group
def get_rank_from_stage(self, stage_id, **kwargs): def get_rank_from_stage(self, stage_id, **kwargs):
return self._topo.get_rank_from_stage(self.global_rank, return self._topo.get_rank_from_stage(
pipe=stage_id, self.global_rank, pipe=stage_id, **kwargs
**kwargs) )
class _CommunicateGroup(object): class _CommunicateGroup(object):
""" tmp for static """ """tmp for static"""
def __init__(self): def __init__(self):
global _HYBRID_PARALLEL_GROUP global _HYBRID_PARALLEL_GROUP
_HYBRID_PARALLEL_GROUP = self _HYBRID_PARALLEL_GROUP = self
self.groups = dict() self.groups = dict()
def set_comm_group(self, group_name, group_rank, group_size, ring_id, def set_comm_group(
group_ranks): self, group_name, group_rank, group_size, ring_id, group_ranks
group = paddle.distributed.collective.Group(group_rank, ring_id, ):
group_ranks) group = paddle.distributed.collective.Group(
group_rank, ring_id, group_ranks
)
self.groups[group_name] = group self.groups[group_name] = group
def get_group(self, group_name): def get_group(self, group_name):
......
...@@ -103,6 +103,7 @@ def _check_var_exists(var_name): ...@@ -103,6 +103,7 @@ def _check_var_exists(var_name):
def init_parallel_env(): def init_parallel_env():
""" """
Initialize parallel training environment in dynamic graph mode. Initialize parallel training environment in dynamic graph mode.
Note: Note:
...@@ -118,6 +119,7 @@ def init_parallel_env(): ...@@ -118,6 +119,7 @@ def init_parallel_env():
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
...@@ -158,6 +160,7 @@ def init_parallel_env(): ...@@ -158,6 +160,7 @@ def init_parallel_env():
if __name__ == '__main__': if __name__ == '__main__':
dist.spawn(train) dist.spawn(train)
""" """
# 0. get env & check world size # 0. get env & check world size
......
...@@ -51,61 +51,76 @@ __all__ = [ ...@@ -51,61 +51,76 @@ __all__ = [
def _check_normalization(norm): def _check_normalization(norm):
if norm not in ['forward', 'backward', 'ortho']: if norm not in ['forward', 'backward', 'ortho']:
raise ValueError( raise ValueError(
"Unexpected norm: {}. Norm should be forward, backward or ortho". "Unexpected norm: {}. Norm should be forward, backward or ortho".format(
format(norm)) norm
)
)
def _check_fft_n(n): def _check_fft_n(n):
if not isinstance(n, int): if not isinstance(n, int):
raise ValueError( raise ValueError(
"Invalid FFT argument n({}), it shoule be an integer.".format(n)) "Invalid FFT argument n({}), it shoule be an integer.".format(n)
)
if n <= 0: if n <= 0:
raise ValueError( raise ValueError(
"Invalid FFT argument n({}), it should be positive.".format(n)) "Invalid FFT argument n({}), it should be positive.".format(n)
)
def _check_fft_shape(x, s): def _check_fft_shape(x, s):
ndim = x.ndim ndim = x.ndim
if not isinstance(s, Sequence): if not isinstance(s, Sequence):
raise ValueError( raise ValueError(
"Invaid FFT argument s({}), it should be a sequence of integers.") "Invaid FFT argument s({}), it should be a sequence of integers."
)
if len(s) > ndim: if len(s) > ndim:
raise ValueError( raise ValueError(
"Length of FFT argument s should not be larger than the rank of input. " "Length of FFT argument s should not be larger than the rank of input. "
"Received s: {}, rank of x: {}".format(s, ndim)) "Received s: {}, rank of x: {}".format(s, ndim)
)
for size in s: for size in s:
if not isinstance(size, int) or size <= 0: if not isinstance(size, int) or size <= 0:
raise ValueError("FFT sizes {} contains invalid value ({})".format( raise ValueError(
s, size)) "FFT sizes {} contains invalid value ({})".format(s, size)
)
def _check_fft_axis(x, axis): def _check_fft_axis(x, axis):
ndim = x.ndim ndim = x.ndim
if not isinstance(axis, int): if not isinstance(axis, int):
raise ValueError( raise ValueError(
"Invalid FFT axis ({}), it shoule be an integer.".format(axis)) "Invalid FFT axis ({}), it shoule be an integer.".format(axis)
)
if axis < -ndim or axis >= ndim: if axis < -ndim or axis >= ndim:
raise ValueError( raise ValueError(
"Invalid FFT axis ({}), it should be in range [-{}, {})".format( "Invalid FFT axis ({}), it should be in range [-{}, {})".format(
axis, ndim, ndim)) axis, ndim, ndim
)
)
def _check_fft_axes(x, axes): def _check_fft_axes(x, axes):
ndim = x.ndim ndim = x.ndim
if not isinstance(axes, Sequence): if not isinstance(axes, Sequence):
raise ValueError( raise ValueError(
"Invalid FFT axes ({}), it should be a sequence of integers.". "Invalid FFT axes ({}), it should be a sequence of integers.".format(
format(axes)) axes
)
)
if len(axes) > ndim: if len(axes) > ndim:
raise ValueError( raise ValueError(
"Length of fft axes should not be larger than the rank of input. " "Length of fft axes should not be larger than the rank of input. "
"Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)
)
for axis in axes: for axis in axes:
if not isinstance(axis, int) or axis < -ndim or axis >= ndim: if not isinstance(axis, int) or axis < -ndim or axis >= ndim:
raise ValueError( raise ValueError(
"FFT axes {} contains invalid value ({}), it should be in range [-{}, {})" "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".format(
.format(axes, axis, ndim, ndim)) axes, axis, ndim, ndim
)
)
def _resize_fft_input(x, s, axes): def _resize_fft_input(x, s, axes):
...@@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes): ...@@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes):
slices.append((0, s[i])) slices.append((0, s[i]))
if axes_to_slice: if axes_to_slice:
x = paddle.slice(x, x = paddle.slice(
x,
axes_to_slice, axes_to_slice,
starts=[item[0] for item in slices], starts=[item[0] for item in slices],
ends=[item[1] for item in slices]) ends=[item[1] for item in slices],
)
if axes_to_pad: if axes_to_pad:
padding_widths = [0] * (2 * ndim) padding_widths = [0] * (2 * ndim)
for axis, pad in zip(axes_to_pad, paddings): for axis, pad in zip(axes_to_pad, paddings):
...@@ -146,8 +163,9 @@ def _normalize_axes(x, axes): ...@@ -146,8 +163,9 @@ def _normalize_axes(x, axes):
def _check_at_least_ndim(x, rank): def _check_at_least_ndim(x, rank):
if x.ndim < rank: if x.ndim < rank:
raise ValueError("The rank of the input ({}) should >= {}".format( raise ValueError(
x.ndim, rank)) "The rank of the input ({}) should >= {}".format(x.ndim, rank)
)
# public APIs 1d # public APIs 1d
...@@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): ...@@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None):
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fft_r2c(x, return fft_r2c(
n, x, n, axis, norm, forward=True, onesided=False, name=name
axis, )
norm,
forward=True,
onesided=False,
name=name)
else: else:
return fft_c2c(x, n, axis, norm, forward=True, name=name) return fft_c2c(x, n, axis, norm, forward=True, name=name)
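# The dispatch above sends real-valued inputs through the r2c kernel and
# complex inputs through c2c; both return a complex spectrum. A small check,
# assuming a working paddle installation:
import numpy as np
import paddle

x_real = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0], dtype="float32"))
x_cplx = paddle.to_tensor(np.array([1, 2, 3, 4], dtype="complex64"))

print(paddle.fft.fft(x_real).dtype)  # paddle.complex64, via fft_r2c (onesided=False)
print(paddle.fft.fft(x_cplx).dtype)  # paddle.complex64, via fft_c2c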
...@@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): ...@@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None):
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fft_r2c(x, return fft_r2c(
n, x, n, axis, norm, forward=False, onesided=False, name=name
axis, )
norm,
forward=False,
onesided=False,
name=name)
else: else:
return fft_c2c(x, n, axis, norm, forward=False, name=name) return fft_c2c(x, n, axis, norm, forward=False, name=name)
...@@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): ...@@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
# [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]]
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fftn_r2c(x, return fftn_r2c(
s, x, s, axes, norm, forward=True, onesided=False, name=name
axes, )
norm,
forward=True,
onesided=False,
name=name)
else: else:
return fftn_c2c(x, s, axes, norm, forward=True, name=name) return fftn_c2c(x, s, axes, norm, forward=True, name=name)
...@@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): ...@@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None):
# (-0.1666666716337204+0.28867512941360474j)]]) # (-0.1666666716337204+0.28867512941360474j)]])
""" """
if is_integer(x) or is_floating_point(x): if is_integer(x) or is_floating_point(x):
return fftn_r2c(x, return fftn_r2c(
s, x, s, axes, norm, forward=False, onesided=False, name=name
axes, )
norm,
forward=False,
onesided=False,
name=name)
else: else:
return fftn_c2c(x, s, axes, norm, forward=False, name=name) return fftn_c2c(x, s, axes, norm, forward=False, name=name)
def rfftn(x, s=None, axes=None, norm="backward", name=None): def rfftn(x, s=None, axes=None, norm="backward", name=None):
""" """
The N dimensional FFT for real input. The N dimensional FFT for real input.
This function computes the N-dimensional discrete Fourier Transform over This function computes the N-dimensional discrete Fourier Transform over
...@@ -665,10 +668,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): ...@@ -665,10 +668,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None):
refer to :ref:`api_guide_Name` . refer to :ref:`api_guide_Name` .
Returns: Returns:
out(Tensor): complex tensor out(Tensor), complex tensor
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return fftn(x, s, axes, norm, name) return fftn(x, s, axes, norm, name)
...@@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return ifftn(x, s, axes, norm, name) return ifftn(x, s, axes, norm, name)
...@@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return rfftn(x, s, axes, norm, name) return rfftn(x, s, axes, norm, name)
...@@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return irfftn(x, s, axes, norm, name) return irfftn(x, s, axes, norm, name)
...@@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return hfftn(x, s, axes, norm, name) return hfftn(x, s, axes, norm, name)
...@@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): ...@@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
if s is not None: if s is not None:
if not isinstance(s, Sequence) or len(s) != 2: if not isinstance(s, Sequence) or len(s) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument s ({}), it should be a sequence of 2 integers." "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
.format(s)) s
)
)
if axes is not None: if axes is not None:
if not isinstance(axes, Sequence) or len(axes) != 2: if not isinstance(axes, Sequence) or len(axes) != 2:
raise ValueError( raise ValueError(
"Invalid FFT argument axes ({}), it should be a sequence of 2 integers." "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format(
.format(axes)) axes
)
)
return ihfftn(x, s, axes, norm, name) return ihfftn(x, s, axes, norm, name)
...@@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name): ...@@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name):
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype) out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): ...@@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
if in_dygraph_mode(): if in_dygraph_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'onesided', onesided) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'onesided',
onesided,
)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
else: else:
inputs = { inputs = {
...@@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): ...@@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_real_to_complex_dtype(dtype)) _real_to_complex_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name): ...@@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name):
out = _C_ops.fft_c2r(x, axes, norm, forward, 0) out = _C_ops.fft_c2r(x, axes, norm, forward, 0)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
if n is not None: if n is not None:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'last_dim_size', n) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'last_dim_size',
n,
)
else: else:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward) attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
...@@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name): ...@@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_complex_to_real_dtype(dtype)) _complex_to_real_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
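In the legacy-dygraph branches above, operator attributes are passed as one flat, interleaved name/value tuple and unpacked with `*attrs`. A hedged pure-Python sketch of how such a tuple relates to a plain dict of attributes (the dict-based helper is illustrative only):

.. code-block:: python

    def flatten_attrs(attr_dict):
        # Produce ('axes', axes, 'normalization', norm, ...) -- name/value
        # pairs interleaved in a single tuple, ready to unpack with *attrs.
        flat = []
        for name, value in attr_dict.items():
            flat.extend((name, value))
        return tuple(flat)

    attrs = flatten_attrs(
        {'axes': [0], 'normalization': 'backward', 'forward': True, 'last_dim_size': 8}
    )
    # attrs == ('axes', [0], 'normalization', 'backward',
    #           'forward', True, 'last_dim_size', 8)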
...@@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name): ...@@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name):
if s is not None: if s is not None:
if len(s) != len(axes): if len(s) != len(axes):
raise ValueError( raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.". "Length of s ({}) and length of axes ({}) does not match.".format(
format(len(s), len(axes))) len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] s = [s[i] for i in axes_argsoft]
if s is not None: if s is not None:
...@@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): ...@@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name):
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype) out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
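The `s = [s[i] for i in axes_argsoft]` step above keeps each FFT length paired with its axis once the axes are sorted; `axes_argsoft` is assumed here to be the argsort of the user-supplied axes, computed earlier in the function outside this hunk. A small numpy sketch of the reordering:

.. code-block:: python

    import numpy as np

    axes = [2, 0, 1]
    s = [8, 16, 32]                     # s[i] is the length for axes[i]
    axes_argsoft = np.argsort(axes)     # assumed definition: argsort of axes
    s_reordered = [s[i] for i in axes_argsoft]
    # sorted axes -> [0, 1, 2], s_reordered -> [16, 32, 8];
    # each axis keeps the length it was originally paired with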
...@@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if s is not None: if s is not None:
if len(s) != len(axes): if len(s) != len(axes):
raise ValueError( raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.". "Length of s ({}) and length of axes ({}) does not match.".format(
format(len(s), len(axes))) len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] + [s[-1]] s = [s[i] for i in axes_argsoft] + [s[-1]]
if s is not None: if s is not None:
...@@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
if in_dygraph_mode(): if in_dygraph_mode():
out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) out = _C_ops.fft_r2c(x, axes, norm, forward, onesided)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'onesided', onesided) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'onesided',
onesided,
)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
else: else:
inputs = { inputs = {
...@@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): ...@@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_real_to_complex_dtype(dtype)) _real_to_complex_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name):
if s is not None: if s is not None:
if len(s) != len(axes): if len(s) != len(axes):
raise ValueError( raise ValueError(
"Length of s ({}) and length of axes ({}) does not match.". "Length of s ({}) and length of axes ({}) does not match.".format(
format(len(s), len(axes))) len(s), len(axes)
)
)
s = [s[i] for i in axes_argsoft] + [s[-1]] s = [s[i] for i in axes_argsoft] + [s[-1]]
if s is not None: if s is not None:
...@@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name):
out = _C_ops.fft_c2r(x, axes, norm, forward, 0) out = _C_ops.fft_c2r(x, axes, norm, forward, 0)
elif _in_legacy_dygraph(): elif _in_legacy_dygraph():
if s: if s:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward, attrs = (
'last_dim_size', s[-1]) 'axes',
axes,
'normalization',
norm,
'forward',
forward,
'last_dim_size',
s[-1],
)
else: else:
attrs = ('axes', axes, 'normalization', norm, 'forward', forward) attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
out = getattr(_legacy_C_ops, op_type)(x, *attrs) out = getattr(_legacy_C_ops, op_type)(x, *attrs)
...@@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): ...@@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
_complex_to_real_dtype(dtype)) _complex_to_real_dtype(dtype)
)
outputs = {"Out": [out]} outputs = {"Out": [out]}
helper.append_op(type=op_type, helper.append_op(
inputs=inputs, type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
...@@ -23,9 +23,9 @@ from ...log_helper import get_logger ...@@ -23,9 +23,9 @@ from ...log_helper import get_logger
__all__ = ['add_supported_layer'] __all__ = ['add_supported_layer']
_logger = get_logger(__name__, _logger = get_logger(
logging.INFO, __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
fmt='%(asctime)s-%(levelname)s: %(message)s') )
def _default_pruning(weight_nparray, m, n, func_name, param_name): def _default_pruning(weight_nparray, m, n, func_name, param_name):
...@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): ...@@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
exlude_cond_shape4 = len(shape) == 4 and shape[1] < m exlude_cond_shape4 = len(shape) == 4 and shape[1] < m
if exlude_cond_shape2: if exlude_cond_shape2:
_logger.warning( _logger.warning(
'{} is not pruned because the first dimension of {} is smaller than {}' '{} is not pruned because the first dimension of {} is smaller than {}'.format(
.format(param_name, shape, m)) param_name, shape, m
)
)
return weight_pruned_nparray, weight_sparse_mask return weight_pruned_nparray, weight_sparse_mask
if exlude_cond_shape4: if exlude_cond_shape4:
_logger.warning( _logger.warning(
'{} is not pruned because the second dimension of {} is smaller than {}' '{} is not pruned because the second dimension of {} is smaller than {}'.format(
.format(param_name, shape, m)) param_name, shape, m
)
)
return weight_pruned_nparray, weight_sparse_mask return weight_pruned_nparray, weight_sparse_mask
checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
...@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): ...@@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
# sparsity/utils is row-major pruning. That is the reason we have to transpose weight # sparsity/utils is row-major pruning. That is the reason we have to transpose weight
# matrices before invoking create_mask. Then we transpose the result mask to make # matrices before invoking create_mask. Then we transpose the result mask to make
# sure its shape to be the same as the input weight. # sure its shape to be the same as the input weight.
weight_sparse_mask = sparsity.create_mask(weight_nparray.T, weight_sparse_mask = sparsity.create_mask(
func_name=func_name, weight_nparray.T, func_name=func_name, n=n, m=m
n=n, ).T
m=m).T
weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ assert sparsity.check_sparsity(
'Pruning {} weight matrix failure!!!'.format(param_name) weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name
), 'Pruning {} weight matrix failure!!!'.format(param_name)
return weight_pruned_nparray, weight_sparse_mask return weight_pruned_nparray, weight_sparse_mask
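Because the mask utilities prune row-wise (as the comment above notes), `_default_pruning` transposes the weight, builds the mask, and transposes the mask back. A toy numpy sketch of that transpose-prune-transpose flow, using a simple keep-the-larger-half rule in place of `sparsity.create_mask` (purely illustrative):

.. code-block:: python

    import numpy as np

    def toy_row_mask(mat):
        # Keep the larger half of each row by magnitude -- a stand-in for the
        # real row-major n:m mask computed by sparsity.create_mask.
        mask = np.zeros_like(mat)
        keep = mat.shape[1] // 2
        idx = np.argsort(np.absolute(mat), axis=1)[:, keep:]
        np.put_along_axis(mask, idx, 1.0, axis=1)
        return mask

    weight = np.random.randn(4, 8)
    mask = toy_row_mask(weight.T).T            # prune along the transposed rows
    pruned = np.multiply(weight, mask)         # same shape as the input weight
    assert pruned.shape == weight.shape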
...@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {} ...@@ -78,6 +82,7 @@ supported_layers_and_prune_func_map = {}
def add_supported_layer(layer, pruning_func=None): def add_supported_layer(layer, pruning_func=None):
r""" r"""
Add supported layers and its corresponding pruning function. Add supported layers and its corresponding pruning function.
Args: Args:
...@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None): ...@@ -87,19 +92,25 @@ def add_supported_layer(layer, pruning_func=None):
pruning_func (function, optional): a function type which receives five argument (weight_nparray, pruning_func (function, optional): a function type which receives five argument (weight_nparray,
m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
m, n, and func_name, please see `prune_model` for details. m, n, and func_name, please see `prune_model` for details.
""" """
name = None name = None
if isinstance(layer, str): if isinstance(layer, str):
name = layer name = layer
elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake( name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
type(layer).__name__) type(layer).__name__
)
elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
name = paddle.fluid.dygraph.layers._convert_camel_to_snake( name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
layer.__name__) layer.__name__
)
else: else:
assert "The type of layer should be string of Layer, but got {}!".format( assert (
type(layer)) "The type of layer should be string of Layer, but got {}!".format(
type(layer)
)
)
if pruning_func is None: if pruning_func is None:
pruning_func = _default_pruning pruning_func = _default_pruning
_supported_layers_and_prune_func_map_lock.acquire() _supported_layers_and_prune_func_map_lock.acquire()
......
...@@ -27,9 +27,16 @@ from itertools import permutations ...@@ -27,9 +27,16 @@ from itertools import permutations
import threading import threading
__all__ = [ __all__ = [
'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'calculate_density',
'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'check_mask_1d',
'MaskAlgo', 'CheckMethod' 'get_mask_1d',
'check_mask_2d',
'get_mask_2d_greedy',
'get_mask_2d_best',
'create_mask',
'check_sparsity',
'MaskAlgo',
'CheckMethod',
] ]
...@@ -76,8 +83,9 @@ class CheckMethod(Enum): ...@@ -76,8 +83,9 @@ class CheckMethod(Enum):
CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
# CheckMethod.CHECK_2D # CheckMethod.CHECK_2D
""" """
assert isinstance(mask_algo, MaskAlgo), \ assert isinstance(
"mask_algo should be MaskAlgo type" mask_algo, MaskAlgo
), "mask_algo should be MaskAlgo type"
if mask_algo == MaskAlgo.MASK_1D: if mask_algo == MaskAlgo.MASK_1D:
return CheckMethod.CHECK_1D return CheckMethod.CHECK_1D
else: else:
...@@ -86,20 +94,25 @@ class CheckMethod(Enum): ...@@ -86,20 +94,25 @@ class CheckMethod(Enum):
def calculate_density(x): def calculate_density(x):
r""" r"""
Return the density of the input tensor. Return the density of the input tensor.
Args: Args:
x (nparray): The input tensor. x (nparray): The input tensor.
Returns: Returns:
float: The density of :attr:`x`. float, The density of :attr:`x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import numpy as np import numpy as np
x = np.array([[0, 1, 3, 0], x = np.array([[0, 1, 3, 0],
[1, 1, 0, 1]]) [1, 1, 0, 1]])
paddle.incubate.asp.calculate_density(x) # 0.625 paddle.incubate.asp.calculate_density(x) # 0.625
""" """
x_flattened = x.flatten() x_flattened = x.flatten()
return float(np.nonzero(x_flattened)[0].size) / x_flattened.size return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
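Density is just the fraction of non-zero entries, so the docstring's 0.625 can be reproduced with plain numpy:

.. code-block:: python

    import numpy as np

    x = np.array([[0, 1, 3, 0],
                  [1, 1, 0, 1]])
    density = float(np.count_nonzero(x)) / x.size
    # density == 0.625, matching the calculate_density example above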
...@@ -126,7 +139,7 @@ def _reshape_1d(mat, m): ...@@ -126,7 +139,7 @@ def _reshape_1d(mat, m):
remainder = mat.shape[1] % m remainder = mat.shape[1] % m
if mat.shape[1] % m > 0: if mat.shape[1] % m > 0:
mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
mat_padded[:, :mat.shape[1]] = mat mat_padded[:, : mat.shape[1]] = mat
shape = mat_padded.shape shape = mat_padded.shape
return mat_padded.reshape(-1, m), shape return mat_padded.reshape(-1, m), shape
else: else:
...@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m): ...@@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m):
min_order_indices = np.argsort(np.absolute(sub_mat)) min_order_indices = np.argsort(np.absolute(sub_mat))
mask_flattern[i, min_order_indices[:n].tolist()] = 0 mask_flattern[i, min_order_indices[:n].tolist()] = 0
mask_flattern = mask_flattern.reshape(shape) mask_flattern = mask_flattern.reshape(shape)
mask[:, :] = mask_flattern[:, :mat.shape[1]] mask[:, :] = mask_flattern[:, : mat.shape[1]]
return mask return mask
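get_mask_1d zeroes the n smallest-magnitude entries in every group of m along each row (after the padding done by _reshape_1d). A minimal numpy sketch of that rule on one already-aligned row:

.. code-block:: python

    import numpy as np

    row = np.array([0.1, -2.0, 0.3, 1.5, -0.2, 0.05, 4.0, -1.0])
    n, m = 2, 4
    groups = row.reshape(-1, m)
    mask = np.ones_like(groups)
    for i, group in enumerate(groups):
        smallest = np.argsort(np.absolute(group))[:n]  # n smallest magnitudes
        mask[i, smallest] = 0
    mask = mask.reshape(row.shape)
    # mask -> [0, 1, 0, 1, 0, 0, 1, 1]: two entries survive in each group of four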
...@@ -239,12 +252,12 @@ def _reshape_2d(mat, m): ...@@ -239,12 +252,12 @@ def _reshape_2d(mat, m):
remainder_0 = mat.shape[0] % m remainder_0 = mat.shape[0] % m
remainder_1 = mat.shape[1] % m remainder_1 = mat.shape[1] % m
new_shape = (mat.shape[0] if remainder_0 == 0 \ new_shape = (
else mat.shape[0] + (m - remainder_0), mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0),
mat.shape[1] if remainder_1 == 0 \ mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1),
else mat.shape[1] + (m - remainder_1)) )
mat_padded = np.zeros(new_shape) mat_padded = np.zeros(new_shape)
mat_padded[:mat.shape[0], :mat.shape[1]] = mat mat_padded[: mat.shape[0], : mat.shape[1]] = mat
mat_flattern = np.empty(new_shape).reshape(-1, m * m) mat_flattern = np.empty(new_shape).reshape(-1, m * m)
curr_idx = 0 curr_idx = 0
...@@ -252,9 +265,9 @@ def _reshape_2d(mat, m): ...@@ -252,9 +265,9 @@ def _reshape_2d(mat, m):
row_end = row_start + m row_end = row_start + m
for col_start in range(0, mat_padded.shape[1], m): for col_start in range(0, mat_padded.shape[1], m):
col_end = col_start + m col_end = col_start + m
sub_mat = np.squeeze(mat_padded[row_start:row_end, \ sub_mat = np.squeeze(
col_start:col_end] \ mat_padded[row_start:row_end, col_start:col_end].reshape(-1)
.reshape(-1)) )
mat_flattern[curr_idx] = sub_mat mat_flattern[curr_idx] = sub_mat
curr_idx += 1 curr_idx += 1
return mat_flattern, mat_padded.shape return mat_flattern, mat_padded.shape
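_reshape_2d pads both dimensions up to the next multiple of m and flattens each m x m block into one row, so the 2-D mask rules can operate block by block. A compact numpy sketch of the same padding and blocking:

.. code-block:: python

    import numpy as np

    def pad_to_multiple(mat, m):
        # Round each dimension up to a multiple of m, zero-filling the rest.
        rows = -(-mat.shape[0] // m) * m
        cols = -(-mat.shape[1] // m) * m
        padded = np.zeros((rows, cols))
        padded[: mat.shape[0], : mat.shape[1]] = mat
        return padded

    mat = np.arange(6.0).reshape(2, 3)
    padded = pad_to_multiple(mat, 4)          # shape (4, 4)
    blocks = [
        padded[r : r + 4, c : c + 4].reshape(-1)
        for r in range(0, padded.shape[0], 4)
        for c in range(0, padded.shape[1], 4)
    ]
    # one flattened 4x4 block of length 16, like the rows of mat_flattern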
...@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m): ...@@ -304,8 +317,9 @@ def check_mask_2d(mat, n, m):
mat_padded, shape = _reshape_2d(mat, m) mat_padded, shape = _reshape_2d(mat, m)
for sub_mat in mat_padded: for sub_mat in mat_padded:
sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and (
(np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0): np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0
):
return False return False
return True return True
...@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m): ...@@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m):
sub_mask = np.squeeze(mask_padded[idx]) sub_mask = np.squeeze(mask_padded[idx])
min_order_1d_indices = np.argsort(sub_mat) min_order_1d_indices = np.argsort(sub_mat)
min_order_2d_indices = [(int(x / m), x % m) min_order_2d_indices = [
for x in min_order_1d_indices] (int(x / m), x % m) for x in min_order_1d_indices
]
row_counter = collections.Counter() row_counter = collections.Counter()
col_counter = collections.Counter() col_counter = collections.Counter()
for i in range(len(min_order_1d_indices) - 1, -1, -1): for i in range(len(min_order_1d_indices) - 1, -1, -1):
matrix_entry = min_order_2d_indices[i] matrix_entry = min_order_2d_indices[i]
if (row_counter[matrix_entry[0]] == n) or \ if (row_counter[matrix_entry[0]] == n) or (
(col_counter[matrix_entry[1]] == n): col_counter[matrix_entry[1]] == n
):
continue continue
sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0 sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
...@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m): ...@@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m):
col_end = col_start + m col_end = col_start + m
mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx] mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
curr_idx += 1 curr_idx += 1
return mask[:mat.shape[0], :mat.shape[1]] return mask[: mat.shape[0], : mat.shape[1]]
_valid_2d_patterns_lock = threading.Lock() _valid_2d_patterns_lock = threading.Lock()
...@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m): ...@@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m):
patterns = patterns + patterns patterns = patterns + patterns
patterns = np.asarray(list(set(permutations(patterns, m)))) patterns = np.asarray(list(set(permutations(patterns, m))))
valid = ((patterns.sum(axis=1) <= n).sum( valid = (
axis=1) == m).nonzero()[0].reshape(-1) ((patterns.sum(axis=1) <= n).sum(axis=1) == m)
.nonzero()[0]
.reshape(-1)
)
valid_patterns = np.empty((valid.shape[0], m, m)) valid_patterns = np.empty((valid.shape[0], m, m))
valid_patterns[:] = patterns[valid[:]] valid_patterns[:] = patterns[valid[:]]
...@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m): ...@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m):
mat_flattern, shape = _reshape_2d(mat, m) mat_flattern, shape = _reshape_2d(mat, m)
mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
pmax = np.argmax(np.matmul(mat_flattern, pmax = np.argmax(
patterns.reshape(patterns.shape[0], m * m).T), np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
axis=1) axis=1,
)
mask_flattern[:] = patterns[pmax[:]] mask_flattern[:] = patterns[pmax[:]]
mask = np.empty(shape) mask = np.empty(shape)
...@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m): ...@@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m):
col_end = col_start + m col_end = col_start + m
mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
curr_idx += 1 curr_idx += 1
return mask[:mat.shape[0], :mat.shape[1]] return mask[: mat.shape[0], : mat.shape[1]]
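get_mask_2d_best scores every valid m x m pattern against every block with a dot product and keeps the highest-scoring pattern per block. A hedged numpy sketch of that selection step, with a tiny hand-made pattern set standing in for _compute_valid_2d_patterns:

.. code-block:: python

    import numpy as np

    m = 2
    # Two toy 2x2 patterns; the real code enumerates all valid n:m patterns.
    patterns = np.array([[[1, 0], [0, 1]],
                         [[0, 1], [1, 0]]], dtype=float)
    blocks = np.abs(np.random.randn(5, m * m))   # 5 flattened blocks (magnitudes)

    scores = np.matmul(blocks, patterns.reshape(patterns.shape[0], m * m).T)
    best = np.argmax(scores, axis=1)             # index of best pattern per block
    block_masks = patterns[best]                 # shape (5, 2, 2), one mask per block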
def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
...@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): ...@@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
dtype = tensor.dtype dtype = tensor.dtype
t = tensor.astype(float) t = tensor.astype(float)
assert isinstance(func_name, MaskAlgo), \ assert isinstance(func_name, MaskAlgo), (
"func_name argumet of create_mask is only accepted as type MaskAlgo. " \ "func_name argumet of create_mask is only accepted as type MaskAlgo. "
"But got {}".format(type(func_name)) "But got {}".format(type(func_name))
)
func = getattr(sys.modules[__name__], func_name.value, None) func = getattr(sys.modules[__name__], func_name.value, None)
if len(shape) == 1: if len(shape) == 1:
t = t.reshape(1, shape[0]) t = t.reshape(1, shape[0])
...@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): ...@@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
t = t.reshape(shape[0] * shape[1], shape[2]) t = t.reshape(shape[0] * shape[1], shape[2])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif len(shape) == 4: elif len(shape) == 4:
t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], t = t.transpose([0, 1, 3, 2]).reshape(
shape[2]) shape[0] * shape[1] * shape[3], shape[2]
)
mask = func(t, n=n, m=m) mask = func(t, n=n, m=m)
return mask.reshape([shape[0], shape[1], shape[3], return (
shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) mask.reshape([shape[0], shape[1], shape[3], shape[2]])
.transpose([0, 1, 3, 2])
.astype(dtype)
)
else: else:
raise ValueError("The dimension of input tensor is not supported in create_mask, " \ raise ValueError(
"Only dimension < 4 is supported but got {}".format(len(shape))) "The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}".format(len(shape))
)
mask = func(t, n=n, m=m) mask = func(t, n=n, m=m)
return mask.reshape(shape).astype(dtype) return mask.reshape(shape).astype(dtype)
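For 4-D convolution weights, create_mask transposes (h, w, in, out) to (h, w, out, in), flattens to (h*w*out, in) for masking, and then reverses the reshape, as the branch above shows. A numpy sketch of just that round trip:

.. code-block:: python

    import numpy as np

    h, w, cin, cout = 3, 3, 8, 4
    weight = np.random.randn(h, w, cin, cout)

    t = weight.transpose([0, 1, 3, 2]).reshape(h * w * cout, cin)
    # ... a 2-D mask of shape (h*w*cout, cin) would be computed here ...
    restored = t.reshape([h, w, cout, cin]).transpose([0, 1, 3, 2])

    assert restored.shape == weight.shape
    assert np.allclose(restored, weight)   # the reshape round trip is lossless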
...@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): ...@@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
shape = tensor.shape shape = tensor.shape
t = tensor.astype(float) t = tensor.astype(float)
assert type(func_name) == CheckMethod, \ assert type(func_name) == CheckMethod, (
"func_name argumet of check_sparsity is only accepted as type CheckMethod. " \ "func_name argumet of check_sparsity is only accepted as type CheckMethod. "
"But got {}".format(type(func_name)) "But got {}".format(type(func_name))
)
func = getattr(sys.modules[__name__], func_name.value, None) func = getattr(sys.modules[__name__], func_name.value, None)
if len(shape) == 1: if len(shape) == 1:
t = t.reshape(1, shape[0]) t = t.reshape(1, shape[0])
...@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): ...@@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
t = t.reshape(shape[0] * shape[1], shape[2]) t = t.reshape(shape[0] * shape[1], shape[2])
# 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
elif len(shape) == 4: elif len(shape) == 4:
t = t.transpose([0, 1, 3, t = t.transpose([0, 1, 3, 2]).reshape(
2]).reshape([shape[0] * shape[1] * shape[3], shape[2]]) [shape[0] * shape[1] * shape[3], shape[2]]
)
else: else:
raise ValueError("The dimension of input tensor is not supported in create_mask, " \ raise ValueError(
"Only dimension < 4 is supported but got {}".format(len(shape))) "The dimension of input tensor is not supported in create_mask, "
"Only dimension < 4 is supported but got {}".format(len(shape))
)
return func(t, n=n, m=m) return func(t, n=n, m=m)
...@@ -32,12 +32,25 @@ from . import parallel_helper ...@@ -32,12 +32,25 @@ from . import parallel_helper
from .. import unique_name from .. import unique_name
from paddle.fluid import core from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper from .layer_object_helper import LayerObjectHelper
from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder from .layer_hooks import (
from .base import program_desc_tracing_guard, param_guard, in_declarative_mode, _convert_into_variable record_program_ops_pre_hook,
set_op_customized_attrs_post_hook,
LayerOpsRecoder,
)
from .base import (
program_desc_tracing_guard,
param_guard,
in_declarative_mode,
_convert_into_variable,
)
from paddle.fluid import framework from paddle.fluid import framework
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from paddle.fluid.executor import Executor, global_scope from paddle.fluid.executor import Executor, global_scope
from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_, in_dygraph_mode from paddle.fluid.framework import (
_non_static_mode,
convert_np_dtype_to_dtype_,
in_dygraph_mode,
)
from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import Program, program_guard
from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.core import VarDesc from paddle.fluid.core import VarDesc
...@@ -67,7 +80,7 @@ def _addindent(string, indent): ...@@ -67,7 +80,7 @@ def _addindent(string, indent):
class HookRemoveHelper(object): class HookRemoveHelper(object):
""" A HookRemoveHelper that can be used to remove hook. """ """A HookRemoveHelper that can be used to remove hook."""
next_hook_id = 0 next_hook_id = 0
...@@ -153,13 +166,14 @@ class Layer(object): ...@@ -153,13 +166,14 @@ class Layer(object):
def train(self): def train(self):
""" """
Sets this Layer and all its sublayers to training mode. Sets this Layer and all its sublayers to training mode.
This only affects certain modules like `Dropout` and `BatchNorm`. This only affects certain modules like `Dropout` and `BatchNorm`.
Returns: Returns:
None None
Example:: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -236,6 +250,7 @@ class Layer(object): ...@@ -236,6 +250,7 @@ class Layer(object):
def apply(self, fn): def apply(self, fn):
""" """
Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``) Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``)
as well as self. Typical use includes initializing the parameters of a model. as well as self. Typical use includes initializing the parameters of a model.
...@@ -243,7 +258,7 @@ class Layer(object): ...@@ -243,7 +258,7 @@ class Layer(object):
fn (function): a function to be applied to each sublayer fn (function): a function to be applied to each sublayer
Returns: Returns:
Layer: self Layer, self
Example:: Example::
.. code-block:: python .. code-block:: python
...@@ -263,6 +278,7 @@ class Layer(object): ...@@ -263,6 +278,7 @@ class Layer(object):
net.apply(init_weights) net.apply(init_weights)
print(net.state_dict()) print(net.state_dict())
""" """
for layer in self.children(): for layer in self.children():
layer.apply(fn) layer.apply(fn)
...@@ -272,10 +288,12 @@ class Layer(object): ...@@ -272,10 +288,12 @@ class Layer(object):
return self return self
def full_name(self): def full_name(self):
"""Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ """
Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
Returns: Returns:
str: full name of this layer. str, full name of this layer.
Example:: Example::
.. code-block:: python .. code-block:: python
...@@ -297,7 +315,9 @@ class Layer(object): ...@@ -297,7 +315,9 @@ class Layer(object):
return self._full_name return self._full_name
def register_forward_post_hook(self, hook): def register_forward_post_hook(self, hook):
"""Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. """
Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed.
It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively. It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively.
User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer. User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer.
...@@ -308,7 +328,7 @@ class Layer(object): ...@@ -308,7 +328,7 @@ class Layer(object):
hook(function): a function registered as a forward post-hook hook(function): a function registered as a forward post-hook
Returns: Returns:
HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -340,13 +360,16 @@ class Layer(object): ...@@ -340,13 +360,16 @@ class Layer(object):
# hook change the linear's output to output * 2, so out0 is equal to out1 * 2. # hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
assert (out0.numpy() == (out1.numpy()) * 2).any() assert (out0.numpy() == (out1.numpy()) * 2).any()
""" """
hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) hook_remove_helper = HookRemoveHelper(self._forward_post_hooks)
self._forward_post_hooks[hook_remove_helper._hook_id] = hook self._forward_post_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper return hook_remove_helper
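A minimal usage sketch of the post-hook mechanism, following the output-doubling example referenced in the docstring above (assumes paddle is installed; the layer and shapes are arbitrary):

.. code-block:: python

    import paddle

    def double_output_hook(layer, inputs, output):
        # A forward post-hook receives the layer, its inputs and its output,
        # and may return a replacement output.
        return output * 2

    linear = paddle.nn.Linear(4, 4)
    x = paddle.rand([2, 4])

    handle = linear.register_forward_post_hook(double_output_hook)
    out_hooked = linear(x)
    handle.remove()                  # HookRemoveHelper.remove() detaches the hook
    out_plain = linear(x)

    assert paddle.allclose(out_hooked, out_plain * 2)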
def register_forward_pre_hook(self, hook): def register_forward_pre_hook(self, hook):
"""Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. """
Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
It should have the following form, `input` of the `hook` is `input` of the `Layer`, It should have the following form, `input` of the `hook` is `input` of the `Layer`,
hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if
...@@ -359,7 +382,7 @@ class Layer(object): ...@@ -359,7 +382,7 @@ class Layer(object):
hook(function): a function registered as a forward pre-hook hook(function): a function registered as a forward pre-hook
Returns: Returns:
HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` .
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -398,12 +421,14 @@ class Layer(object): ...@@ -398,12 +421,14 @@ class Layer(object):
self._forward_pre_hooks[hook_remove_helper._hook_id] = hook self._forward_pre_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper return hook_remove_helper
def create_parameter(self, def create_parameter(
self,
shape, shape,
attr=None, attr=None,
dtype=None, dtype=None,
is_bias=False, is_bias=False,
default_initializer=None): default_initializer=None,
):
"""Create parameters for this layer. """Create parameters for this layer.
Parameters: Parameters:
...@@ -443,12 +468,15 @@ class Layer(object): ...@@ -443,12 +468,15 @@ class Layer(object):
temp_attr = copy.deepcopy(attr) temp_attr = copy.deepcopy(attr)
if isinstance(temp_attr, six.string_types) and temp_attr == "": if isinstance(temp_attr, six.string_types) and temp_attr == "":
temp_attr = None temp_attr = None
return self._helper.create_parameter(temp_attr, shape, dtype, is_bias, return self._helper.create_parameter(
default_initializer) temp_attr, shape, dtype, is_bias, default_initializer
)
@deprecated(since="2.0.0", @deprecated(
since="2.0.0",
update_to="paddle.nn.Layer.create_tensor", update_to="paddle.nn.Layer.create_tensor",
reason="New api in create_tensor, easier to use.") reason="New api in create_tensor, easier to use.",
)
def create_variable(self, name=None, persistable=None, dtype=None): def create_variable(self, name=None, persistable=None, dtype=None):
""" """
...@@ -488,14 +516,16 @@ class Layer(object): ...@@ -488,14 +516,16 @@ class Layer(object):
if name is not None: if name is not None:
var_name = ".".join([self._full_name, name]) var_name = ".".join([self._full_name, name])
else: else:
var_name = unique_name.generate(".".join( var_name = unique_name.generate(
[self._full_name, "_generated_var"])) ".".join([self._full_name, "_generated_var"])
)
return self._helper.main_program.current_block().create_var( return self._helper.main_program.current_block().create_var(
name=var_name, name=var_name,
persistable=persistable, persistable=persistable,
dtype=dtype, dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR) type=core.VarDesc.VarType.LOD_TENSOR,
)
# TODO: Add more parameter list when we need them # TODO: Add more parameter list when we need them
def create_tensor(self, name=None, persistable=None, dtype=None): def create_tensor(self, name=None, persistable=None, dtype=None):
...@@ -538,20 +568,24 @@ class Layer(object): ...@@ -538,20 +568,24 @@ class Layer(object):
if name is not None: if name is not None:
var_name = ".".join([self._full_name, name]) var_name = ".".join([self._full_name, name])
else: else:
var_name = unique_name.generate(".".join( var_name = unique_name.generate(
[self._full_name, "_generated_var"])) ".".join([self._full_name, "_generated_var"])
)
return self._helper.main_program.current_block().create_var( return self._helper.main_program.current_block().create_var(
name=var_name, name=var_name,
persistable=persistable, persistable=persistable,
dtype=dtype, dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR) type=core.VarDesc.VarType.LOD_TENSOR,
)
def parameters(self, include_sublayers=True): def parameters(self, include_sublayers=True):
"""Returns a list of all Parameters from current layer and its sub-layers. """
Returns a list of all Parameters from current layer and its sub-layers.
Returns: Returns:
list of Tensor : a list of Parameters. list of Tensor, a list of Parameters.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -563,13 +597,17 @@ class Layer(object): ...@@ -563,13 +597,17 @@ class Layer(object):
""" """
ret = [ ret = [
param for _, param in self.named_parameters( param
include_sublayers=include_sublayers) for _, param in self.named_parameters(
include_sublayers=include_sublayers
)
] ]
return ret return ret
def children(self): def children(self):
"""Returns an iterator over immediate children layers. """
Returns an iterator over immediate children layers.
Yields: Yields:
Layer: a child layer Layer: a child layer
...@@ -619,13 +657,15 @@ class Layer(object): ...@@ -619,13 +657,15 @@ class Layer(object):
yield name, layer yield name, layer
def sublayers(self, include_self=False): def sublayers(self, include_self=False):
"""Returns a list of sub layers. """
Returns a list of sub layers.
Parameters: Parameters:
include_self(bool, optional): Whether return self as sublayers. Default: False include_self(bool, optional): Whether return self as sublayers. Default: False
Returns: Returns:
list of Layer : a list of sub layers. list of Layer, a list of sub layers.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -678,9 +718,11 @@ class Layer(object): ...@@ -678,9 +718,11 @@ class Layer(object):
""" """
params_set = set() params_set = set()
named_sublayers = self.named_sublayers( named_sublayers = (
prefix=prefix, include_self=True) if include_sublayers else zip( self.named_sublayers(prefix=prefix, include_self=True)
[prefix], [self]) if include_sublayers
else zip([prefix], [self])
)
for layer_prefix, sublayer in named_sublayers: for layer_prefix, sublayer in named_sublayers:
params = sublayer._parameters.items() params = sublayer._parameters.items()
for key, param in params: for key, param in params:
...@@ -724,9 +766,9 @@ class Layer(object): ...@@ -724,9 +766,9 @@ class Layer(object):
if layer is None: if layer is None:
continue continue
layer_prefix = prefix + ('.' if prefix else '') + key layer_prefix = prefix + ('.' if prefix else '') + key
for p, l in layer.named_sublayers(prefix=layer_prefix, for p, l in layer.named_sublayers(
include_self=True, prefix=layer_prefix, include_self=True, layers_set=layers_set
layers_set=layers_set): ):
yield p, l yield p, l
def register_buffer(self, name, tensor, persistable=True): def register_buffer(self, name, tensor, persistable=True):
...@@ -769,25 +811,32 @@ class Layer(object): ...@@ -769,25 +811,32 @@ class Layer(object):
if '_buffers' not in self.__dict__: if '_buffers' not in self.__dict__:
raise ValueError( raise ValueError(
"super(YourLayer, self).__init__() should be called first") "super(YourLayer, self).__init__() should be called first"
)
elif not isinstance(name, six.string_types): elif not isinstance(name, six.string_types):
raise TypeError( raise TypeError(
"The name of buffer should be a string, but received {}.". "The name of buffer should be a string, but received {}.".format(
format(type(name).__name__)) type(name).__name__
)
)
elif '.' in name: elif '.' in name:
raise KeyError( raise KeyError(
"The name of buffer can not contain `.`, " "The name of buffer can not contain `.`, "
"because when you access the newly added buffer in the " "because when you access the newly added buffer in the "
"form of `self.**.**`, it will cause AttributeError.") "form of `self.**.**`, it will cause AttributeError."
)
elif name == '': elif name == '':
raise KeyError("The name of buffer can not be empty.") raise KeyError("The name of buffer can not be empty.")
elif hasattr(self, name) and name not in self._buffers: elif hasattr(self, name) and name not in self._buffers:
raise KeyError("attribute '{}' already exists.".format(name)) raise KeyError("attribute '{}' already exists.".format(name))
elif tensor is not None and not (type(tensor) == core.VarBase elif tensor is not None and not (
or type(tensor) == core.eager.Tensor): type(tensor) == core.VarBase or type(tensor) == core.eager.Tensor
):
raise TypeError( raise TypeError(
"The registered buffer should be a Paddle.Tensor, but received {}." "The registered buffer should be a Paddle.Tensor, but received {}.".format(
.format(type(tensor).__name__)) type(tensor).__name__
)
)
else: else:
self._buffers[name] = tensor self._buffers[name] = tensor
if persistable: if persistable:
...@@ -797,13 +846,14 @@ class Layer(object): ...@@ -797,13 +846,14 @@ class Layer(object):
def buffers(self, include_sublayers=True): def buffers(self, include_sublayers=True):
""" """
Returns a list of all buffers from current layer and its sub-layers. Returns a list of all buffers from current layer and its sub-layers.
Parameters: Parameters:
include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True
Returns: Returns:
list of Tensor : a list of buffers. list of Tensor, a list of buffers.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -820,8 +870,10 @@ class Layer(object): ...@@ -820,8 +870,10 @@ class Layer(object):
""" """
ret = [ ret = [
buffer for _, buffer in self.named_buffers( buffer
include_sublayers=include_sublayers) for _, buffer in self.named_buffers(
include_sublayers=include_sublayers
)
] ]
return ret return ret
...@@ -862,9 +914,11 @@ class Layer(object): ...@@ -862,9 +914,11 @@ class Layer(object):
""" """
buffers_set = set() buffers_set = set()
named_sublayers = self.named_sublayers( named_sublayers = (
prefix=prefix, include_self=True) if include_sublayers else zip( self.named_sublayers(prefix=prefix, include_self=True)
[prefix], [self]) if include_sublayers
else zip([prefix], [self])
)
for layer_prefix, sublayer in named_sublayers: for layer_prefix, sublayer in named_sublayers:
buffers = sublayer._buffers.items() buffers = sublayer._buffers.items()
for key, buffer in buffers: for key, buffer in buffers:
...@@ -910,7 +964,7 @@ class Layer(object): ...@@ -910,7 +964,7 @@ class Layer(object):
hook_result = forward_pre_hook(self, inputs) hook_result = forward_pre_hook(self, inputs)
if hook_result is not None: if hook_result is not None:
if not isinstance(hook_result, tuple): if not isinstance(hook_result, tuple):
hook_result = (hook_result, ) hook_result = (hook_result,)
inputs = hook_result inputs = hook_result
if not self._built: if not self._built:
...@@ -920,16 +974,20 @@ class Layer(object): ...@@ -920,16 +974,20 @@ class Layer(object):
# TODO(liuyuhui) Only xpu broadcast parameters here. # TODO(liuyuhui) Only xpu broadcast parameters here.
# The other device is to call _sync_params_buffers in DataParallel # The other device is to call _sync_params_buffers in DataParallel
# to realize the parameter synchronization among multiply cards. # to realize the parameter synchronization among multiply cards.
if parallel_helper._is_data_parallel_mode( if (
) and paddle.is_compiled_with_xpu(): parallel_helper._is_data_parallel_mode()
and paddle.is_compiled_with_xpu()
):
parallel_helper._broadcast_parameters( parallel_helper._broadcast_parameters(
self._parameters.values()) self._parameters.values()
)
self._built = True self._built = True
if in_profiler_mode(): if in_profiler_mode():
with profiler.RecordEvent(self.__class__.__name__, with profiler.RecordEvent(
profiler.TracerEventType.Forward): self.__class__.__name__, profiler.TracerEventType.Forward
):
outputs = self.forward(*inputs, **kwargs) outputs = self.forward(*inputs, **kwargs)
else: else:
outputs = self.forward(*inputs, **kwargs) outputs = self.forward(*inputs, **kwargs)
...@@ -942,8 +1000,14 @@ class Layer(object): ...@@ -942,8 +1000,14 @@ class Layer(object):
return outputs return outputs
def __call__(self, *inputs, **kwargs): def __call__(self, *inputs, **kwargs):
if (not in_declarative_mode()) and (not self._forward_pre_hooks) \ if (
and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()): (not in_declarative_mode())
and (not self._forward_pre_hooks)
and (not self._forward_post_hooks)
and (not self._built)
and in_dygraph_mode()
and (not in_profiler_mode())
):
self._build_once(*inputs, **kwargs) self._build_once(*inputs, **kwargs)
return self.forward(*inputs, **kwargs) return self.forward(*inputs, **kwargs)
else: else:
...@@ -964,7 +1028,9 @@ class Layer(object): ...@@ -964,7 +1028,9 @@ class Layer(object):
raise ValueError("Layer shouldn't implement backward") raise ValueError("Layer shouldn't implement backward")
def add_sublayer(self, name, sublayer): def add_sublayer(self, name, sublayer):
"""Adds a sub Layer instance. """
Adds a sub Layer instance.
Added sublayer can be accessed by self.name Added sublayer can be accessed by self.name
...@@ -972,7 +1038,7 @@ class Layer(object): ...@@ -972,7 +1038,7 @@ class Layer(object):
name(str): name of this sublayer. name(str): name of this sublayer.
sublayer(Layer): an instance of Layer. sublayer(Layer): an instance of Layer.
Returns: Returns:
Layer: the sublayer passed in. Layer, the sublayer passed in.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -999,8 +1065,9 @@ class Layer(object): ...@@ -999,8 +1065,9 @@ class Layer(object):
model = MySequential(fc1, fc2) model = MySequential(fc1, fc2)
for prefix, layer in model.named_sublayers(): for prefix, layer in model.named_sublayers():
print(prefix, layer) print(prefix, layer)
""" """
assert (isinstance(sublayer, Layer) or sublayer == None) assert isinstance(sublayer, Layer) or sublayer == None
self._sub_layers[name] = sublayer self._sub_layers[name] = sublayer
return sublayer return sublayer
...@@ -1014,7 +1081,7 @@ class Layer(object): ...@@ -1014,7 +1081,7 @@ class Layer(object):
name(str): name of this sublayer. name(str): name of this sublayer.
parameter(Parameter): an instance of Parameter. parameter(Parameter): an instance of Parameter.
Returns: Returns:
Parameter: the parameter passed in. Parameter, the parameter passed in.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1037,32 +1104,42 @@ class Layer(object): ...@@ -1037,32 +1104,42 @@ class Layer(object):
""" """
if '_parameters' not in self.__dict__: if '_parameters' not in self.__dict__:
raise RuntimeError( raise RuntimeError(
"super(YourLayer, self).__init__() should be called firstly.") "super(YourLayer, self).__init__() should be called firstly."
)
elif not isinstance(name, six.string_types): elif not isinstance(name, six.string_types):
raise TypeError( raise TypeError(
"The name of parameter should be a string, but received {}.". "The name of parameter should be a string, but received {}.".format(
format(type(name).__name__)) type(name).__name__
)
)
elif '.' in name: elif '.' in name:
raise KeyError( raise KeyError(
"The name of parameter can not contain `.`, " "The name of parameter can not contain `.`, "
"because when you access the newly added parameter in the " "because when you access the newly added parameter in the "
"form of `self.**.**`, it will cause AttributeError.") "form of `self.**.**`, it will cause AttributeError."
)
elif name == '': elif name == '':
raise KeyError("The name of parameter can not be empty.") raise KeyError("The name of parameter can not be empty.")
elif hasattr(self, name) and name not in self._parameters: elif hasattr(self, name) and name not in self._parameters:
raise KeyError("The parameter '{}' already exists.".format(name)) raise KeyError("The parameter '{}' already exists.".format(name))
elif parameter is not None and not isinstance(parameter, elif parameter is not None and not isinstance(
framework.Parameter): parameter, framework.Parameter
):
raise TypeError( raise TypeError(
"The parameter to be added should be a Parameter, but received {}." "The parameter to be added should be a Parameter, but received {}.".format(
.format(type(parameter).__name__)) type(parameter).__name__
)
)
else: else:
if parameter is None: if parameter is None:
self._parameters[name] = None self._parameters[name] = None
if len(self._loaddict_holder) > 0: if len(self._loaddict_holder) > 0:
assert parameter.name in self._loaddict_holder, "Parameter not found, Can't find [ {} ] in state_dict".format( assert (
parameter.name) parameter.name in self._loaddict_holder
), "Parameter not found, Can't find [ {} ] in state_dict".format(
parameter.name
)
parameter.set_value(self._loaddict_holder[parameter.name]) parameter.set_value(self._loaddict_holder[parameter.name])
...@@ -1081,37 +1158,50 @@ class Layer(object): ...@@ -1081,37 +1158,50 @@ class Layer(object):
""" """
def is_already_registered(is_pre_hook): def is_already_registered(is_pre_hook):
layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks layers_hooks = (
candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook self._forward_pre_hooks
if is_pre_hook
else self._forward_post_hooks
)
candidate_hook = (
record_program_ops_pre_hook
if is_pre_hook
else set_op_customized_attrs_post_hook
)
already_registed = False already_registed = False
if layers_hooks: if layers_hooks:
last_key = next(reversed(layers_hooks)) last_key = next(reversed(layers_hooks))
already_registed = (layers_hooks[last_key] == candidate_hook) already_registed = layers_hooks[last_key] == candidate_hook
return already_registed return already_registed
if not isinstance(attrs, dict): if not isinstance(attrs, dict):
raise TypeError( raise TypeError(
"attrs should be type(dict), but received {}".format( "attrs should be type(dict), but received {}".format(
type(attrs).__name__)) type(attrs).__name__
)
)
# NOTE: Overwrite behavior for same key. # NOTE: Overwrite behavior for same key.
self._customized_attrs.update(attrs) self._customized_attrs.update(attrs)
if not is_already_registered(is_pre_hook=True): if not is_already_registered(is_pre_hook=True):
pre_hook_helper = self.register_forward_pre_hook( pre_hook_helper = self.register_forward_pre_hook(
record_program_ops_pre_hook) record_program_ops_pre_hook
)
assert len(self._op_recorder.hooks) == 0 assert len(self._op_recorder.hooks) == 0
self._op_recorder.hooks = [pre_hook_helper] self._op_recorder.hooks = [pre_hook_helper]
# manually register post_hook to ensure it is inserted into the head. # manually register post_hook to ensure it is inserted into the head.
if not is_already_registered(is_pre_hook=False): if not is_already_registered(is_pre_hook=False):
post_hook_helper = self.register_forward_post_hook( post_hook_helper = self.register_forward_post_hook(
set_op_customized_attrs_post_hook) set_op_customized_attrs_post_hook
)
if len(self._forward_post_hooks) > 1: if len(self._forward_post_hooks) > 1:
self._forward_post_hooks.move_to_end(post_hook_helper._hook_id, self._forward_post_hooks.move_to_end(
last=False) post_hook_helper._hook_id, last=False
)
assert len(self._op_recorder.hooks) == 1 assert len(self._op_recorder.hooks) == 1
...@@ -1144,7 +1234,6 @@ class Layer(object): ...@@ -1144,7 +1234,6 @@ class Layer(object):
return object.__getattribute__(self, name) return object.__getattribute__(self, name)
def __setattr__(self, name, value): def __setattr__(self, name, value):
def _remove_if_exist(*dicts): def _remove_if_exist(*dicts):
for d in dicts: for d in dicts:
if name in d: if name in d:
...@@ -1156,10 +1245,14 @@ class Layer(object): ...@@ -1156,10 +1245,14 @@ class Layer(object):
if isinstance(value, framework.Parameter): if isinstance(value, framework.Parameter):
if params is None: if params is None:
raise ValueError( raise ValueError(
"super(YourLayer, self).__init__() should be called first") "super(YourLayer, self).__init__() should be called first"
)
if len(self._loaddict_holder) > 0: if len(self._loaddict_holder) > 0:
assert value.name in self._loaddict_holder, "Parameter not found, Can't find [ {} ] in state_dict".format( assert (
value.name) value.name in self._loaddict_holder
), "Parameter not found, Can't find [ {} ] in state_dict".format(
value.name
)
value.set_value(self._loaddict_holder[value.name]) value.set_value(self._loaddict_holder[value.name])
...@@ -1168,9 +1261,10 @@ class Layer(object): ...@@ -1168,9 +1261,10 @@ class Layer(object):
elif params is not None and name in params: elif params is not None and name in params:
if value is not None: if value is not None:
raise TypeError( raise TypeError(
"assignment to parameter '{}' should be of type Parameter or None, but got '{}'" "assignment to parameter '{}' should be of type Parameter or None, but got '{}'".format(
.format(name, name, type(value).__name__
type(value).__name__)) )
)
params[name] = None params[name] = None
else: else:
layers = self.__dict__.get('_sub_layers', None) layers = self.__dict__.get('_sub_layers', None)
...@@ -1185,9 +1279,10 @@ class Layer(object): ...@@ -1185,9 +1279,10 @@ class Layer(object):
elif layers is not None and name in layers: elif layers is not None and name in layers:
if value is not None: if value is not None:
raise TypeError( raise TypeError(
"assignment to sublayer '{}' should be of type Layer or None, but got '{}'" "assignment to sublayer '{}' should be of type Layer or None, but got '{}'".format(
.format(name, name, type(value).__name__
type(value).__name__)) )
)
layers[name] = None layers[name] = None
else: else:
_buffers = self.__dict__.get('_buffers', None) _buffers = self.__dict__.get('_buffers', None)
...@@ -1196,8 +1291,9 @@ class Layer(object): ...@@ -1196,8 +1291,9 @@ class Layer(object):
raise ValueError( raise ValueError(
"super(YourLayer, self).__init__() should be called first" "super(YourLayer, self).__init__() should be called first"
) )
_remove_if_exist(self.__dict__, self._parameters, _remove_if_exist(
self._sub_layers) self.__dict__, self._parameters, self._sub_layers
)
# Set persistable=False by default. Only `register_buffer` can # Set persistable=False by default. Only `register_buffer` can
# add a persistable buffer. # add a persistable buffer.
if name not in self._buffers: if name not in self._buffers:
...@@ -1211,6 +1307,7 @@ class Layer(object): ...@@ -1211,6 +1307,7 @@ class Layer(object):
# value via `assign`. # value via `assign`.
if type(value) == framework.Variable: if type(value) == framework.Variable:
from paddle import assign from paddle import assign
# Note(zhhsplendid): the condition below happens in PaddleGan model, # Note(zhhsplendid): the condition below happens in PaddleGan model,
# but should all non-Variable _buffers[name] be re-assign? We # but should all non-Variable _buffers[name] be re-assign? We
# should consider it in the future. I currently wrote this as # should consider it in the future. I currently wrote this as
...@@ -1218,18 +1315,23 @@ class Layer(object): ...@@ -1218,18 +1315,23 @@ class Layer(object):
if in_declarative_mode() and _buffers[name] is None: if in_declarative_mode() and _buffers[name] is None:
raise RuntimeError( raise RuntimeError(
'In Dy2stat, self.{0} is a buffer and self.{0} is ' 'In Dy2stat, self.{0} is a buffer and self.{0} is '
'not allowed to be set to Variable when self.{0} is None.' 'not allowed to be set to Variable when self.{0} is None.'.format(
.format(name)) name
elif _buffers[name] is None or type(getattr( )
self, name)) == core.VarBase: )
elif (
_buffers[name] is None
or type(getattr(self, name)) == core.VarBase
):
_buffers[name] = assign(value) _buffers[name] = assign(value)
else: else:
assign(value, getattr(self, name)) assign(value, getattr(self, name))
elif value is not None: elif value is not None:
raise TypeError( raise TypeError(
"assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'" "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'".format(
.format(name, name, type(value).__name__
type(value).__name__)) )
)
else: else:
# Assigning None will remove the buffer, but if re-assign a new varBase to it, # Assigning None will remove the buffer, but if re-assign a new varBase to it,
# it will be remarked as a buffer with same `persistable` attribute. # it will be remarked as a buffer with same `persistable` attribute.
...@@ -1316,10 +1418,12 @@ class Layer(object): ...@@ -1316,10 +1418,12 @@ class Layer(object):
self._state_dict_hooks[hook_remove_helper._hook_id] = hook self._state_dict_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper return hook_remove_helper
def _obtain_parameters_buffers(self, def _obtain_parameters_buffers(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix=""): structured_name_prefix="",
):
""" """
The difference from state_dict() is that state_dict_hook will not be called, The difference from state_dict() is that state_dict_hook will not be called,
but the original types of parameters and buffers will be maintained. but the original types of parameters and buffers will be maintained.
...@@ -1330,7 +1434,10 @@ class Layer(object): ...@@ -1330,7 +1434,10 @@ class Layer(object):
if data is not None: if data is not None:
destination[structured_name_prefix + name] = data destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items(): for name, buffer in self._buffers.items():
if buffer is not None and name not in self._non_persistable_buffer_names_set: if (
buffer is not None
and name not in self._non_persistable_buffer_names_set
):
destination[structured_name_prefix + name] = buffer destination[structured_name_prefix + name] = buffer
if include_sublayers: if include_sublayers:
...@@ -1339,17 +1446,22 @@ class Layer(object): ...@@ -1339,17 +1446,22 @@ class Layer(object):
destination_temp = destination.copy() destination_temp = destination.copy()
destination_temp.update( destination_temp.update(
layer_item._obtain_parameters_buffers( layer_item._obtain_parameters_buffers(
destination_temp, include_sublayers, destination_temp,
structured_name_prefix + layer_name + ".")) include_sublayers,
structured_name_prefix + layer_name + ".",
)
)
destination = destination_temp destination = destination_temp
return destination return destination
def _state_dict_impl(self, def _state_dict_impl(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix="", structured_name_prefix="",
include_non_persistable_buffer=False, include_non_persistable_buffer=False,
use_hook=True): use_hook=True,
):
""" """
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
...@@ -1367,7 +1479,10 @@ class Layer(object): ...@@ -1367,7 +1479,10 @@ class Layer(object):
destination[structured_name_prefix + name] = data destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items(): for name, buffer in self._buffers.items():
if not include_non_persistable_buffer: if not include_non_persistable_buffer:
if buffer is not None and name not in self._non_persistable_buffer_names_set: if (
buffer is not None
and name not in self._non_persistable_buffer_names_set
):
destination[structured_name_prefix + name] = buffer destination[structured_name_prefix + name] = buffer
else: else:
if buffer is not None: if buffer is not None:
...@@ -1379,9 +1494,13 @@ class Layer(object): ...@@ -1379,9 +1494,13 @@ class Layer(object):
destination_temp = destination.copy() destination_temp = destination.copy()
destination_temp.update( destination_temp.update(
layer_item._state_dict_impl( layer_item._state_dict_impl(
destination_temp, include_sublayers, destination_temp,
include_sublayers,
structured_name_prefix + layer_name + ".", structured_name_prefix + layer_name + ".",
include_non_persistable_buffer, use_hook)) include_non_persistable_buffer,
use_hook,
)
)
destination = destination_temp destination = destination_temp
if use_hook: if use_hook:
for state_dict_hook in self._state_dict_hooks.values(): for state_dict_hook in self._state_dict_hooks.values():
...@@ -1391,12 +1510,15 @@ class Layer(object): ...@@ -1391,12 +1510,15 @@ class Layer(object):
return destination return destination
def to_static_state_dict(self, def to_static_state_dict(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix="", structured_name_prefix="",
use_hook=True): use_hook=True,
):
''' '''
Get all parameters and buffers of current layer and its sub-layers. And set them into a dict Get all parameters and buffers of current layer and its sub-layers. And set them into a dict
Parameters: Parameters:
...@@ -1405,7 +1527,7 @@ class Layer(object): ...@@ -1405,7 +1527,7 @@ class Layer(object):
use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True
Returns: Returns:
dict: a dict contains all the parameters and persistable buffers. dict, a dict contains all the parameters and persistable buffers.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1423,13 +1545,16 @@ class Layer(object): ...@@ -1423,13 +1545,16 @@ class Layer(object):
include_sublayers=include_sublayers, include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix, structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=True, include_non_persistable_buffer=True,
use_hook=use_hook) use_hook=use_hook,
)
def state_dict(self, def state_dict(
self,
destination=None, destination=None,
include_sublayers=True, include_sublayers=True,
structured_name_prefix="", structured_name_prefix="",
use_hook=True): use_hook=True,
):
''' '''
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
...@@ -1457,7 +1582,8 @@ class Layer(object): ...@@ -1457,7 +1582,8 @@ class Layer(object):
include_sublayers=include_sublayers, include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix, structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=False, include_non_persistable_buffer=False,
use_hook=use_hook) use_hook=use_hook,
)
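
The hunks above reformat `_state_dict_impl` and its public wrappers `to_static_state_dict` and `state_dict`, which collect parameters plus (persistable) buffers into a dict. A minimal dygraph sketch of the documented round trip, mirroring the docstring example:

.. code-block:: python

    import paddle

    emb = paddle.nn.Embedding(10, 3)

    state = emb.state_dict()            # parameters + persistable buffers
    paddle.save(state, "emb.pdparams")

    emb2 = paddle.nn.Embedding(10, 3)
    emb2.set_state_dict(paddle.load("emb.pdparams"))
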
@framework.deprecate_stat_dict @framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True): def set_state_dict(self, state_dict, use_structured_name=True):
...@@ -1489,22 +1615,31 @@ class Layer(object): ...@@ -1489,22 +1615,31 @@ class Layer(object):
state = state_dict.get(key, None) state = state_dict.get(key, None)
if state is None: if state is None:
raise ValueError( raise ValueError(
"{} is not found in the provided dict.".format(key)) "{} is not found in the provided dict.".format(key)
if (isinstance(state, dict) or isinstance(state, list)): )
if (len(state) != len(param)): if isinstance(state, dict) or isinstance(state, list):
raise ValueError("{} receieves the length of {}, " if len(state) != len(param):
raise ValueError(
"{} receieves the length of {}, "
"but the expected shape is {}".format( "but the expected shape is {}".format(
key, len(state), len(param))) key, len(state), len(param)
)
)
else: else:
return param, state return param, state
else: else:
state_shape = state.shape() if inspect.ismethod( state_shape = (
state.shape) else state.shape state.shape()
if inspect.ismethod(state.shape)
else state.shape
)
if list(state_shape) != list(param.shape): if list(state_shape) != list(param.shape):
raise ValueError( raise ValueError(
"{} receives a shape {}, but the expected shape is {}.". "{} receives a shape {}, but the expected shape is {}.".format(
format(key, list(state_shape), list(param.shape))) key, list(state_shape), list(param.shape)
)
)
return param, state return param, state
matched_param_state = [] matched_param_state = []
...@@ -1541,8 +1676,10 @@ class Layer(object): ...@@ -1541,8 +1676,10 @@ class Layer(object):
executor = Executor(_get_device())._default_executor executor = Executor(_get_device())._default_executor
# restore parameter states # restore parameter states
core._create_loaded_parameter( core._create_loaded_parameter(
[param for param, state in matched_param_state], global_scope(), [param for param, state in matched_param_state],
executor) global_scope(),
executor,
)
for param, state in matched_param_state: for param, state in matched_param_state:
_set_var(param, state) _set_var(param, state)
...@@ -1594,11 +1731,13 @@ class Layer(object): ...@@ -1594,11 +1731,13 @@ class Layer(object):
# [ 0.33960250, 0.96878713]]) # [ 0.33960250, 0.96878713]])
''' '''
return self._to_impl(device=device, return self._to_impl(
device=device,
dtype=dtype, dtype=dtype,
blocking=blocking, blocking=blocking,
include_sublayers=True, include_sublayers=True,
floating_only=False) floating_only=False,
)
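
`to()` simply forwards to `_to_impl` with `include_sublayers=True` and `floating_only=False`. A short sketch of the documented usage, assuming dygraph mode:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(2, 2)
    linear.to(dtype='float64')    # cast parameters and buffers to float64
    linear.to(device='cpu')       # move parameters and buffers to CPU
    print(linear.weight.dtype, linear.weight.place)
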
def _apply(self, func, device, dtype, blocking, include_sublayers=True): def _apply(self, func, device, dtype, blocking, include_sublayers=True):
if include_sublayers: if include_sublayers:
...@@ -1612,8 +1751,9 @@ class Layer(object): ...@@ -1612,8 +1751,9 @@ class Layer(object):
if param.grad is not None: if param.grad is not None:
with no_grad(): with no_grad():
grad_applied = func(param._grad_ivar(), device, dtype, grad_applied = func(
blocking) param._grad_ivar(), device, dtype, blocking
)
for key, buf in self._buffers.items(): for key, buf in self._buffers.items():
if buf is not None: if buf is not None:
...@@ -1637,12 +1777,14 @@ class Layer(object): ...@@ -1637,12 +1777,14 @@ class Layer(object):
# Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute the memory space occupied by ‘t’. # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute the memory space occupied by ‘t’.
# Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
waiting_alloc_memory = ( waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
gpu_memory_available = core.gpu_memory_available() gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory: if gpu_memory_available < waiting_alloc_memory:
# Copy param / Tensor to cpu # Copy param / Tensor to cpu
t_used = t._copy_to(paddle.CPUPlace(), t_used = t._copy_to(
blocking) # k-v type will error paddle.CPUPlace(), blocking
) # k-v type will error
# Release mem of t # Release mem of t
t.value().get_tensor()._clear() t.value().get_tensor()._clear()
else: else:
...@@ -1653,7 +1795,8 @@ class Layer(object): ...@@ -1653,7 +1795,8 @@ class Layer(object):
# 2. cast param / Tensor to dtype # 2. cast param / Tensor to dtype
if dtype is not None and dtype != t_used.dtype: if dtype is not None and dtype != t_used.dtype:
with paddle.fluid.framework._dygraph_place_guard( with paddle.fluid.framework._dygraph_place_guard(
place=t_used.place): place=t_used.place
):
t_casted = t_used.cast(dtype=dtype) t_casted = t_used.cast(dtype=dtype)
else: else:
t_casted = t_used t_casted = t_used
...@@ -1671,12 +1814,14 @@ class Layer(object): ...@@ -1671,12 +1814,14 @@ class Layer(object):
return t return t
def _to_impl(self, def _to_impl(
self,
device=None, device=None,
dtype=None, dtype=None,
blocking=None, blocking=None,
include_sublayers=True, include_sublayers=True,
floating_only=False): floating_only=False,
):
''' '''
Cast the parameters and buffers of Layer by the given device, dtype and blocking. Cast the parameters and buffers of Layer by the given device, dtype and blocking.
...@@ -1705,20 +1850,28 @@ class Layer(object): ...@@ -1705,20 +1850,28 @@ class Layer(object):
if device is not None: if device is not None:
if isinstance(device, str): if isinstance(device, str):
device = paddle.device._convert_to_place(device) device = paddle.device._convert_to_place(device)
elif isinstance(device, (core.CPUPlace, core.CUDAPlace, elif isinstance(
core.CUDAPinnedPlace, core.XPUPlace)): device,
(
core.CPUPlace,
core.CUDAPlace,
core.CUDAPinnedPlace,
core.XPUPlace,
),
):
pass pass
else: else:
raise ValueError( raise ValueError(
"device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is "
+ type(device).__name__) + type(device).__name__
)
if blocking is None: if blocking is None:
blocking = True blocking = True
else: else:
assert isinstance( assert isinstance(
blocking, blocking, bool
bool), "blocking value error, must be the True, False or None" ), "blocking value error, must be the True, False or None"
def transform(t, device, dtype, blocking): def transform(t, device, dtype, blocking):
if floating_only and (not paddle.is_floating_point(t)): if floating_only and (not paddle.is_floating_point(t)):
......
...@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass): ...@@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass):
@six.add_metaclass(VariableMetaClass) @six.add_metaclass(VariableMetaClass)
class Variable(object): class Variable(object):
""" """
**Notes**:
**The constructor of Variable should not be invoked directly.**
**In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.** Notes:
The constructor of Variable should not be invoked directly.
In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being fed.
**In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data** In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data.
In Fluid, every input and output of an OP is a variable. In most In Fluid, every input and output of an OP is a variable. In most
cases, variables are used for holding different kinds of data or training cases, variables are used for holding different kinds of data or training
...@@ -1514,12 +1515,13 @@ class Variable(object): ...@@ -1514,12 +1515,13 @@ class Variable(object):
def detach(self): def detach(self):
""" """
Returns a new Variable, detached from the current graph. Returns a new Variable, detached from the current graph.
It will share data with origin Variable and without tensor copy. It will share data with origin Variable and without tensor copy.
In addition, the detached Variable doesn't provide gradient propagation. In addition, the detached Variable doesn't provide gradient propagation.
Returns: Returns:
( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable. ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1533,6 +1535,7 @@ class Variable(object): ...@@ -1533,6 +1535,7 @@ class Variable(object):
# create a detached Variable # create a detached Variable
y = x.detach() y = x.detach()
""" """
assert ( assert (
...@@ -2085,6 +2088,7 @@ class Variable(object): ...@@ -2085,6 +2088,7 @@ class Variable(object):
@property @property
def T(self): def T(self):
""" """
Permute current Variable with its dimensions reversed. Permute current Variable with its dimensions reversed.
If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`. If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`.
...@@ -2103,6 +2107,7 @@ class Variable(object): ...@@ -2103,6 +2107,7 @@ class Variable(object):
x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0] x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0]
print(x_T_np.shape) print(x_T_np.shape)
# (5, 3, 2) # (5, 3, 2)
""" """
if len(self.shape) == 1: if len(self.shape) == 1:
return self return self
...@@ -2141,7 +2146,7 @@ class Variable(object): ...@@ -2141,7 +2146,7 @@ class Variable(object):
as ``out = assign(tensor)`` . as ``out = assign(tensor)`` .
Returns: Returns:
Variable: The cloned Variable. Variable, The cloned Variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2171,6 +2176,7 @@ class Variable(object): ...@@ -2171,6 +2176,7 @@ class Variable(object):
def _set_error_clip(self, error_clip): def _set_error_clip(self, error_clip):
""" """
Set the error_clip. Set the error_clip.
Args: Args:
...@@ -2178,11 +2184,13 @@ class Variable(object): ...@@ -2178,11 +2184,13 @@ class Variable(object):
Returns: Returns:
None None
""" """
self.error_clip = error_clip self.error_clip = error_clip
def _set_info(self, key, value): def _set_info(self, key, value):
""" """
Set key-value information for this variable. Set key-value information for this variable.
Args: Args:
...@@ -2191,6 +2199,7 @@ class Variable(object): ...@@ -2191,6 +2199,7 @@ class Variable(object):
Returns: Returns:
None None
""" """
if not hasattr(self, "_info"): if not hasattr(self, "_info"):
self._info = {} self._info = {}
...@@ -2198,6 +2207,7 @@ class Variable(object): ...@@ -2198,6 +2207,7 @@ class Variable(object):
def _get_info(self, key): def _get_info(self, key):
""" """
Get the information of this variable corresponding to key. Get the information of this variable corresponding to key.
Args: Args:
...@@ -2205,6 +2215,7 @@ class Variable(object): ...@@ -2205,6 +2215,7 @@ class Variable(object):
Returns: Returns:
object object
""" """
if hasattr(self, "_info") and key in self._info: if hasattr(self, "_info") and key in self._info:
return self._info[key] return self._info[key]
...@@ -2212,7 +2223,9 @@ class Variable(object): ...@@ -2212,7 +2223,9 @@ class Variable(object):
def _slice_indices(self, slice, length): def _slice_indices(self, slice, length):
""" """
Reference implementation for the slice.indices method. Reference implementation for the slice.indices method.
""" """
# Compute step and length as integers. # Compute step and length as integers.
step = 1 if slice.step is None else slice.step step = 1 if slice.step is None else slice.step
...@@ -2383,7 +2396,7 @@ class Variable(object): ...@@ -2383,7 +2396,7 @@ class Variable(object):
Default: None Default: None
Returns: Returns:
Tensor: the value in given scope. Tensor, the value in given scope.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2438,6 +2451,7 @@ class Variable(object): ...@@ -2438,6 +2451,7 @@ class Variable(object):
def set_value(self, value, scope=None): def set_value(self, value, scope=None):
''' '''
Set the value to the tensor in given scope. Set the value to the tensor in given scope.
Args: Args:
...@@ -2477,6 +2491,7 @@ class Variable(object): ...@@ -2477,6 +2491,7 @@ class Variable(object):
if var.persistable: if var.persistable:
t_load = paddle.load(path+var.name+'.pdtensor') t_load = paddle.load(path+var.name+'.pdtensor')
var.set_value(t_load) var.set_value(t_load)
''' '''
# The 'framework' is a low-level module, and 'executor' # The 'framework' is a low-level module, and 'executor'
...@@ -2547,10 +2562,11 @@ class Variable(object): ...@@ -2547,10 +2562,11 @@ class Variable(object):
def size(self): def size(self):
""" """
Returns the number of elements for current Variable, which is a int64 Variable with shape [1] Returns the number of elements for current Variable, which is a int64 Variable with shape [1]
Returns: Returns:
Variable: the number of elements for current Variable Variable, the number of elements for current Variable
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -2564,6 +2580,7 @@ class Variable(object): ...@@ -2564,6 +2580,7 @@ class Variable(object):
# get the number of elements of the Variable # get the number of elements of the Variable
y = x.size() y = x.size()
""" """
output = self.block.create_var( output = self.block.create_var(
...@@ -2578,23 +2595,27 @@ class Variable(object): ...@@ -2578,23 +2595,27 @@ class Variable(object):
def _set_attr(self, name, val): def _set_attr(self, name, val):
""" """
Set the value of attribute by attribute's name. Set the value of attribute by attribute's name.
Args: Args:
name(str): the attribute name. name(str): the attribute name.
val(int|str|list): the value of the attribute. val(int|str|list): the value of the attribute.
""" """
self._update_desc_attr(name, val) self._update_desc_attr(name, val)
def _has_attr(self, name): def _has_attr(self, name):
""" """
Whether this Variable has the attribute with the name `name` or not. Whether this Variable has the attribute with the name `name` or not.
Args: Args:
name(str): the attribute name. name(str): the attribute name.
Returns: Returns:
bool: True if has this attribute. bool, True if has this attribute.
""" """
return self.desc.has_attr(name) return self.desc.has_attr(name)
...@@ -2624,7 +2645,7 @@ class Variable(object): ...@@ -2624,7 +2645,7 @@ class Variable(object):
name(str): the attribute name. name(str): the attribute name.
Returns: Returns:
int|str|list: The attribute value. The return value int|str|list, The attribute value. The return value
can be any valid attribute type. can be any valid attribute type.
""" """
return self.desc.attr(name) return self.desc.attr(name)
...@@ -3196,14 +3217,16 @@ class Operator(object): ...@@ -3196,14 +3217,16 @@ class Operator(object):
def input(self, name): def input(self, name):
r""" r"""
Get the input arguments according to the input parameter name. Get the input arguments according to the input parameter name.
Args: Args:
name(str): The input parameter name. name(str): The input parameter name.
Returns: Returns:
list: return the list of argument names that associated with \ list, return the list of argument names that associated with \
the specific parameter name. the specific parameter name.
""" """
return self.desc.input(name) return self.desc.input(name)
......
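
The framework.py hunks above mostly adjust return-type wording in the `Variable` docstrings (`detach`, `get_value`, `set_value`, `size`, and the attribute helpers). A minimal static-graph sketch of the `get_value`/`set_value` pattern those docstrings describe; the `fc` layer and shapes are placeholder choices:

.. code-block:: python

    import paddle

    paddle.enable_static()
    x = paddle.static.data(name="x", shape=[None, 10], dtype="float32")
    y = paddle.static.nn.fc(x, 5)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())

    prog = paddle.static.default_main_program()
    for var in prog.list_vars():
        if var.persistable:
            t = var.get_value()     # the Tensor held in the global scope
            var.set_value(t)        # write a value back into that scope
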
...@@ -20,7 +20,13 @@ from __future__ import print_function ...@@ -20,7 +20,13 @@ from __future__ import print_function
import warnings import warnings
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable, _non_static_mode, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode from ..framework import (
Variable,
_non_static_mode,
_varbase_creator,
_in_legacy_dygraph,
in_dygraph_mode,
)
from .. import core from .. import core
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from . import nn from . import nn
...@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc'] ...@@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None): def accuracy(input, label, k=1, correct=None, total=None):
""" """
accuracy layer. accuracy layer.
Refer to the https://en.wikipedia.org/wiki/Precision_and_recall Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
This function computes the accuracy using the input and label. This function computes the accuracy using the input and label.
If the correct label occurs in top k predictions, then correct will increment by one. If the correct label occurs in top k predictions, then correct will increment by one.
Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
Note:
the dtype of accuracy is determined by input. the input and label dtype can be different.
Args: Args:
input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64.
The shape is ``[sample_number, class_dim]`` . The shape is ``[sample_number, class_dim]`` .
label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` .
k(int): The top k predictions for each class will be checked. Data type is int64 or int32. k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1.
correct(Tensor): The correct predictions count. A Tensor with type int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None.
total(Tensor): The total entries count. A tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None.
Returns: Returns:
Tensor: The correct rate. A Tensor with type float32. Tensor, The correct rate. A Tensor with type float32.
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle import paddle
import paddle.static as static import paddle.static as static
...@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
fetch_list=[result[0]]) fetch_list=[result[0]])
print(output) print(output)
#[array([0.], dtype=float32)] #[array([0.], dtype=float32)]
""" """
if _non_static_mode(): if _non_static_mode():
if correct is None: if correct is None:
...@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None):
total = _varbase_creator(dtype="int32") total = _varbase_creator(dtype="int32")
_k = k.numpy().item(0) if isinstance(k, Variable) else k _k = k.numpy().item(0) if isinstance(k, Variable) else k
topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k, topk_out, topk_indices = _legacy_C_ops.top_k_v2(
'sorted', False) input, 'k', _k, 'sorted', False
_acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label, )
correct, total) _acc, _, _ = _legacy_C_ops.accuracy(
topk_out, topk_indices, label, correct, total
)
return _acc return _acc
helper = LayerHelper("accuracy", **locals()) helper = LayerHelper("accuracy", **locals())
check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], check_variable_and_dtype(
'accuracy') input, 'input', ['float16', 'float32', 'float64'], 'accuracy'
)
topk_out = helper.create_variable_for_type_inference(dtype=input.dtype) topk_out = helper.create_variable_for_type_inference(dtype=input.dtype)
topk_indices = helper.create_variable_for_type_inference(dtype="int64") topk_indices = helper.create_variable_for_type_inference(dtype="int64")
inputs = {"X": [input]} inputs = {"X": [input]}
...@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None):
else: else:
attrs = {'k': k} attrs = {'k': k}
attrs['sorted'] = False attrs['sorted'] = False
helper.append_op(type="top_k_v2", helper.append_op(
type="top_k_v2",
inputs=inputs, inputs=inputs,
attrs=attrs, attrs=attrs,
outputs={ outputs={"Out": [topk_out], "Indices": [topk_indices]},
"Out": [topk_out], )
"Indices": [topk_indices]
})
acc_out = helper.create_variable_for_type_inference(dtype="float32") acc_out = helper.create_variable_for_type_inference(dtype="float32")
if correct is None: if correct is None:
correct = helper.create_variable_for_type_inference(dtype="int32") correct = helper.create_variable_for_type_inference(dtype="int32")
if total is None: if total is None:
total = helper.create_variable_for_type_inference(dtype="int32") total = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(type="accuracy", helper.append_op(
inputs={ type="accuracy",
"Out": [topk_out], inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]},
"Indices": [topk_indices],
"Label": [label]
},
outputs={ outputs={
"Accuracy": [acc_out], "Accuracy": [acc_out],
"Correct": [correct], "Correct": [correct],
"Total": [total], "Total": [total],
}) },
)
return acc_out return acc_out
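
As the docstring above states, a sample counts as correct when its label appears among the top-k predictions. A small NumPy sketch of that semantics (an illustration of the metric, not the Paddle op itself):

.. code-block:: python

    import numpy as np

    def topk_accuracy(pred, label, k=1):
        # pred: [N, C] scores, label: [N, 1] integer class ids
        topk = np.argsort(-pred, axis=1)[:, :k]   # indices of the k largest scores
        correct = (topk == label.reshape(-1, 1)).any(axis=1).sum()
        return correct / float(pred.shape[0])

    pred = np.array([[0.1, 0.7, 0.2],
                     [0.3, 0.4, 0.3]], dtype="float32")
    label = np.array([[1], [0]], dtype="int64")
    print(topk_accuracy(pred, label, k=1))        # 0.5
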
def auc(input, def auc(
input,
label, label,
curve='ROC', curve='ROC',
num_thresholds=2**12 - 1, num_thresholds=2**12 - 1,
topk=1, topk=1,
slide_steps=1, slide_steps=1,
ins_tag_weight=None): ins_tag_weight=None,
):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -216,13 +232,14 @@ def auc(input, ...@@ -216,13 +232,14 @@ def auc(input,
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
if ins_tag_weight is None: if ins_tag_weight is None:
ins_tag_weight = tensor.fill_constant(shape=[1, 1], ins_tag_weight = tensor.fill_constant(
dtype="float32", shape=[1, 1], dtype="float32", value=1.0
value=1.0) )
check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc') check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc')
check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc') check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc')
check_variable_and_dtype(ins_tag_weight, 'ins_tag_weight', check_variable_and_dtype(
['float32', 'float64'], 'auc') ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc'
)
auc_out = helper.create_variable_for_type_inference(dtype="float64") auc_out = helper.create_variable_for_type_inference(dtype="float64")
batch_auc_out = helper.create_variable_for_type_inference(dtype="float64") batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
...@@ -236,62 +253,71 @@ def auc(input, ...@@ -236,62 +253,71 @@ def auc(input,
batch_stat_pos = helper.create_global_variable( batch_stat_pos = helper.create_global_variable(
persistable=True, persistable=True,
dtype='int64', dtype='int64',
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
)
batch_stat_neg = helper.create_global_variable( batch_stat_neg = helper.create_global_variable(
persistable=True, persistable=True,
dtype='int64', dtype='int64',
shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) shape=[(1 + slide_steps) * (num_thresholds + 1) + 1],
)
# for global auc # for global auc
# Needn't maintain the batch id # Needn't maintain the batch id
stat_pos = helper.create_global_variable(persistable=True, stat_pos = helper.create_global_variable(
dtype='int64', persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
shape=[1, num_thresholds + 1]) )
stat_neg = helper.create_global_variable(persistable=True, stat_neg = helper.create_global_variable(
dtype='int64', persistable=True, dtype='int64', shape=[1, num_thresholds + 1]
shape=[1, num_thresholds + 1]) )
for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
helper.set_variable_initializer(var, Constant(value=0.0, helper.set_variable_initializer(
force_cpu=False)) var, Constant(value=0.0, force_cpu=False)
)
#"InsTagWeight": [ins_tag_weight] # "InsTagWeight": [ins_tag_weight]
# Batch AUC # Batch AUC
helper.append_op(type="auc", helper.append_op(
type="auc",
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"StatPos": [batch_stat_pos], "StatPos": [batch_stat_pos],
"StatNeg": [batch_stat_neg] "StatNeg": [batch_stat_neg],
}, },
attrs={ attrs={
"curve": curve, "curve": curve,
"num_thresholds": num_thresholds, "num_thresholds": num_thresholds,
"slide_steps": slide_steps "slide_steps": slide_steps,
}, },
outputs={ outputs={
"AUC": [batch_auc_out], "AUC": [batch_auc_out],
"StatPosOut": [batch_stat_pos], "StatPosOut": [batch_stat_pos],
"StatNegOut": [batch_stat_neg] "StatNegOut": [batch_stat_neg],
}) },
)
# Global AUC # Global AUC
helper.append_op(type="auc", helper.append_op(
type="auc",
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"StatPos": [stat_pos], "StatPos": [stat_pos],
"StatNeg": [stat_neg] "StatNeg": [stat_neg],
}, },
attrs={ attrs={
"curve": curve, "curve": curve,
"num_thresholds": num_thresholds, "num_thresholds": num_thresholds,
"slide_steps": 0 "slide_steps": 0,
}, },
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"StatPosOut": [stat_pos], "StatPosOut": [stat_pos],
"StatNegOut": [stat_neg] "StatNegOut": [stat_neg],
}) },
return auc_out, batch_auc_out, [ )
batch_stat_pos, batch_stat_neg, stat_pos, stat_neg return (
] auc_out,
batch_auc_out,
[batch_stat_pos, batch_stat_neg, stat_pos, stat_neg],
)
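
The `auc` op above keeps per-bucket positive/negative histograms (`stat_pos`, `stat_neg`) of length `num_thresholds + 1`; the batch statistics additionally carry `slide_steps` sliding windows, while the global statistics use `slide_steps=0`. A rough NumPy sketch of the histogram formulation of AUC that these statistics feed (an approximation for illustration, not the kernel itself):

.. code-block:: python

    import numpy as np

    def histogram_auc(scores, labels, num_thresholds=200):
        # bucket index of each prediction in [0, num_thresholds]
        buckets = np.minimum((scores * num_thresholds).astype(int), num_thresholds)
        stat_pos = np.bincount(buckets[labels == 1], minlength=num_thresholds + 1)
        stat_neg = np.bincount(buckets[labels == 0], minlength=num_thresholds + 1)

        # walk the buckets from high score to low score and accumulate the ROC area
        tot_pos = tot_neg = 0.0
        area = 0.0
        for i in range(num_thresholds, -1, -1):
            new_pos = tot_pos + stat_pos[i]
            new_neg = tot_neg + stat_neg[i]
            # trapezoid between consecutive (FP, TP) points
            area += (new_neg - tot_neg) * (tot_pos + new_pos) / 2.0
            tot_pos, tot_neg = new_pos, new_neg
        return area / (tot_pos * tot_neg) if tot_pos and tot_neg else 0.0

    scores = np.array([0.1, 0.4, 0.35, 0.8])
    labels = np.array([0, 0, 1, 1])
    print(histogram_auc(scores, labels))   # 0.75
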
The source diff for this file is too large to display; you can view the blob instead.
...@@ -241,13 +241,13 @@ def send_ue_recv( ...@@ -241,13 +241,13 @@ def send_ue_recv(
src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64. The available data type is int32, int64.
message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`. Default value is `sum`.
out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used. out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1. max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
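
The parameter notes above describe `send_ue_recv` as gathering node features by `src_index`, combining them with the edge feature via `message_op`, and reducing the resulting messages by `dst_index`. A plain NumPy sketch of the `add`/`sum` case (illustration only, not the fused kernel):

.. code-block:: python

    import numpy as np

    def send_ue_recv_sum(x, e, src_index, dst_index, out_size=None):
        # gather node features along edges, add edge features, scatter-sum to dst
        out_size = out_size or int(dst_index.max()) + 1
        messages = x[src_index] + e                      # message_op == "add"
        out = np.zeros((out_size,) + x.shape[1:], dtype=x.dtype)
        np.add.at(out, dst_index, messages)              # reduce_op == "sum"
        return out

    x = np.array([[0., 2., 3.], [1., 4., 5.], [2., 6., 7.]])
    e = np.ones((4, 3))
    src = np.array([0, 0, 1, 2])
    dst = np.array([1, 2, 1, 0])
    print(send_ue_recv_sum(x, e, src, dst))
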
...@@ -26,6 +26,7 @@ def reindex_graph( ...@@ -26,6 +26,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex Graph API. Reindex Graph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -49,12 +50,12 @@ def reindex_graph( ...@@ -49,12 +50,12 @@ def reindex_graph(
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -69,6 +70,7 @@ def reindex_graph( ...@@ -69,6 +70,7 @@ def reindex_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7] neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2] count = [2, 3, 2]
...@@ -138,6 +140,7 @@ def reindex_heter_graph( ...@@ -138,6 +140,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex HeterGraph API. Reindex HeterGraph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -161,12 +164,12 @@ def reindex_heter_graph( ...@@ -161,12 +164,12 @@ def reindex_heter_graph(
The data type should be the same with `x`. The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32. And the data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -183,6 +186,7 @@ def reindex_heter_graph( ...@@ -183,6 +186,7 @@ def reindex_heter_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7] neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2] count_a = [2, 3, 2]
......
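
`reindex_graph` maps the original node ids in `x` and `neighbors` onto a compact 0..n-1 id space, with the nodes of `x` keeping their positions. A short pure-Python sketch of that mapping for the docstring's sample data (hashtable buffers and the heterogeneous variant are omitted):

.. code-block:: python

    def reindex_graph(x, neighbors, count):
        # nodes in `x` keep their position; unseen neighbor ids are appended
        node_map = {node: i for i, node in enumerate(x)}
        out_nodes = list(x)
        for n in neighbors:
            if n not in node_map:
                node_map[n] = len(out_nodes)
                out_nodes.append(n)
        reindex_src = [node_map[n] for n in neighbors]
        # dst of each edge is the x-node whose neighbor block it belongs to
        reindex_dst = [i for i, c in enumerate(count) for _ in range(c)]
        return reindex_src, reindex_dst, out_nodes

    print(reindex_graph([0, 1, 2], [8, 9, 0, 4, 7, 6, 7], [2, 3, 2]))
    # ([3, 4, 0, 5, 6, 7, 6], [0, 0, 1, 1, 1, 2, 2], [0, 1, 2, 8, 9, 4, 7, 6])
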
...@@ -32,6 +32,7 @@ def sample_neighbors( ...@@ -32,6 +32,7 @@ def sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -52,16 +53,16 @@ def sample_neighbors( ...@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1, sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes. which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True, eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the then `eids` should not be None. The data type should be the
same with `row`. Default is None. same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False. return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fisher-yates sampling be the same with `row`. If not None, we will use fisher-yates sampling
to speed up. Only useful for gpu version. to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -78,6 +79,7 @@ def sample_neighbors( ...@@ -78,6 +79,7 @@ def sample_neighbors(
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
......
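
`sample_neighbors` draws up to `sample_size` neighbors per input node from a CSC graph described by `row` and `colptr`. A simplified NumPy sketch of uniform sampling without the `eids`/`perm_buffer` options (the sampled ids will differ from the GPU kernel's, since the RNG differs):

.. code-block:: python

    import numpy as np

    def sample_neighbors(row, colptr, input_nodes, sample_size=-1, seed=0):
        rng = np.random.default_rng(seed)
        out_neighbors, out_count = [], []
        for node in input_nodes:
            candidates = row[colptr[node]:colptr[node + 1]]   # in-neighbors of `node`
            if sample_size < 0 or sample_size >= len(candidates):
                chosen = candidates
            else:
                chosen = rng.choice(candidates, size=sample_size, replace=False)
            out_neighbors.extend(chosen.tolist())
            out_count.append(len(chosen))
        return out_neighbors, out_count

    # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), ...
    row = np.array([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7])
    colptr = np.array([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13])
    print(sample_neighbors(row, colptr, input_nodes=[0, 1], sample_size=2))
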
...@@ -69,8 +69,9 @@ def to_list(value): ...@@ -69,8 +69,9 @@ def to_list(value):
def to_numpy(var): def to_numpy(var):
assert isinstance(var, (Variable, fluid.core.VarBase, assert isinstance(
fluid.core.eager.Tensor)), "not a variable" var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor)
), "not a variable"
if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
return var.numpy() return var.numpy()
t = global_scope().find_var(var.name).get_tensor() t = global_scope().find_var(var.name).get_tensor()
...@@ -105,10 +106,9 @@ def extract_args(func): ...@@ -105,10 +106,9 @@ def extract_args(func):
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(x, return collective._c_allgather(
nranks, x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream
ring_id=ring_id, )
use_calc_stream=use_calc_stream)
def wait_server_ready(endpoints): def wait_server_ready(endpoints):
...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints): ...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints):
for ep in endpoints: for ep in endpoints:
ip_port = ep.split(":") ip_port = ep.split(":")
with contextlib.closing( with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1]))) result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0: if result != 0:
...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints): ...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints):
break break
def init_communicator(program, rank, nranks, wait_port, current_endpoint, def init_communicator(
endpoints): program, rank, nranks, wait_port, current_endpoint, endpoints
):
if nranks < 2: if nranks < 2:
return return
other_endpoints = endpoints[:] other_endpoints = endpoints[:]
...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, ...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
nccl_id_var = block.create_var( nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'), name=fluid.unique_name.generate('nccl_id'),
persistable=True, persistable=True,
type=fluid.core.VarDesc.VarType.RAW) type=fluid.core.VarDesc.VarType.RAW,
)
block.append_op(type='c_gen_nccl_id', block.append_op(
type='c_gen_nccl_id',
inputs={}, inputs={},
outputs={'Out': nccl_id_var}, outputs={'Out': nccl_id_var},
attrs={ attrs={
'rank': rank, 'rank': rank,
'endpoint': current_endpoint, 'endpoint': current_endpoint,
'other_endpoints': other_endpoints 'other_endpoints': other_endpoints,
}) },
)
block.append_op(type='c_comm_init', block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var}, inputs={'X': nccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'nranks': nranks, 'nranks': nranks,
'rank': rank, 'rank': rank,
'ring_id': 0, 'ring_id': 0,
}) },
)
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
hccl_id_var = block.create_var( hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'), name=fluid.unique_name.generate('hccl_id'),
persistable=True, persistable=True,
type=core.VarDesc.VarType.RAW) type=core.VarDesc.VarType.RAW,
block.append_op(type='c_gen_hccl_id', )
block.append_op(
type='c_gen_hccl_id',
inputs={}, inputs={},
outputs={'Out': hccl_id_var}, outputs={'Out': hccl_id_var},
attrs={ attrs={
'rank': rank, 'rank': rank,
'endpoint': current_endpoint, 'endpoint': current_endpoint,
'other_endpoints': other_endpoints 'other_endpoints': other_endpoints,
}) },
block.append_op(type='c_comm_init_hccl', )
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var}, inputs={'X': hccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'rank': rank, 'rank': rank,
'ring_id': 0, 'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")), 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks 'rank_ids': nranks,
}) },
)
def prepare_distributed_context(place=None): def prepare_distributed_context(place=None):
if place is None: if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ place = (
fluid.CUDAPlace(ParallelEnv().dev_id)
if ParallelEnv().nranks > 1
else fluid.CUDAPlace(0) else fluid.CUDAPlace(0)
)
place = _get_paddle_place(place) place = _get_paddle_place(place)
strategy = fluid.dygraph.parallel.ParallelStrategy() strategy = fluid.dygraph.parallel.ParallelStrategy()
...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None): ...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None):
def _init_context(): def _init_context():
communicator_prog = fluid.Program() communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank, init_communicator(
strategy.nranks, True, strategy.current_endpoint, communicator_prog,
strategy.trainer_endpoints) strategy.local_rank,
strategy.nranks,
True,
strategy.current_endpoint,
strategy.trainer_endpoints,
)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(communicator_prog) exe.run(communicator_prog)
...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None): ...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None):
fluid.enable_dygraph(place) fluid.enable_dygraph(place)
else: else:
assert ("Only support CUDAPlace for now.") assert "Only support CUDAPlace for now."
_parallel_context_initialized = True _parallel_context_initialized = True
return strategy return strategy
...@@ -246,7 +266,9 @@ def _update_input_info(inputs): ...@@ -246,7 +266,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter(object): class StaticGraphAdapter(object):
""" """
Model training/inference with a static graph. Model training/inference with a static graph.
""" """
def __init__(self, model): def __init__(self, model):
...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object): ...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object):
'eval_total': 0, 'eval_total': 0,
'test_total': 0, 'test_total': 0,
'eval_batch': 0, 'eval_batch': 0,
'test_batch': 0 'test_batch': 0,
} }
self._nranks = ParallelEnv().nranks self._nranks = ParallelEnv().nranks
...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object): ...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object):
self.model.mode = value self.model.mode = value
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \ assert (
"model not ready, please call `model.prepare()` first" self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.mode = 'train' self.mode = 'train'
assert update is True, "Does not support `update == False` in static mode by now." assert (
update is True
), "Does not support `update == False` in static mode by now."
return self._run(inputs, labels) return self._run(inputs, labels)
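
`train_batch` asserts that `model.prepare()` has already supplied an optimizer. A minimal sketch of the high-level hapi workflow that satisfies this check (the dataset, network, and hyper-parameters are placeholder choices):

.. code-block:: python

    import paddle
    from paddle.vision.datasets import MNIST
    from paddle.vision.transforms import ToTensor

    net = paddle.nn.Sequential(paddle.nn.Flatten(), paddle.nn.Linear(784, 10))
    model = paddle.Model(net)

    model.prepare(
        optimizer=paddle.optimizer.Adam(parameters=model.parameters()),
        loss=paddle.nn.CrossEntropyLoss(),
        metrics=paddle.metric.Accuracy(),
    )
    model.fit(MNIST(mode='train', transform=ToTensor()), epochs=1, batch_size=64)
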
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object): ...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object):
return self.model.network.parameters(*args, **kwargs) return self.model.network.parameters(*args, **kwargs)
def save(self, path): def save(self, path):
def _save(state, path): def _save(state, path):
if not state: if not state:
return return
...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object): ...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object):
# XXX `optimizer.state_dict()` only work in dygraph mode # XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt" optim_path = path + ".pdopt"
optim = { optim = {
p.name: p p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars())
for p in filter(is_belong_to_optimizer, prog.list_vars())
} }
if not optim: if not optim:
return return
...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object): ...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object):
# restore parameter states # restore parameter states
fluid.core._create_loaded_parameter( fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs], global_scope(), [param for param, state in param_state_pairs],
executor) global_scope(),
executor,
)
for param, state in param_state_pairs: for param, state in param_state_pairs:
self._set_var(param, state) self._set_var(param, state)
...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object): ...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object):
# static-graph, since the time of global_step to increase is # static-graph, since the time of global_step to increase is
# different. # different.
state_val = ( state_val = (
np.array(converted_state.pop("global_step")) - 1 (np.array(converted_state.pop("global_step")) - 1)
) if "global_step" in converted_state else converted_state.pop( if "global_step" in converted_state
"@LR_DECAY_COUNTER@", None) else converted_state.pop("@LR_DECAY_COUNTER@", None)
)
if state_val is not None: if state_val is not None:
converted_state[var.name] = state_val converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"): elif var.name.startswith("learning_rate_"):
...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object): ...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object):
opt_cls_name = self.model._optimizer.__class__.__name__ opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None opt_unq_name = None
for name in self.model._optimizer._accumulators.keys(): for name in self.model._optimizer._accumulators.keys():
accum_name = name if opt_name is None else name[ accum_name = (
len(opt_name) + 1:] name
for param_name, state_var in self.model._optimizer._accumulators[ if opt_name is None
name].items(): else name[len(opt_name) + 1 :]
)
for (
param_name,
state_var,
) in self.model._optimizer._accumulators[name].items():
if opt_unq_name is None: if opt_unq_name is None:
# can not infer out the exact unique(opt_name), # can not infer out the exact unique(opt_name),
# thus try to extract rather than generate # thus try to extract rather than generate
for state_key in sorted(state.keys(), for state_key in sorted(
state.keys(),
key=lambda x: len(x), key=lambda x: len(x),
reverse=True): reverse=True,
prefix = param_name + "_" + ( ):
prefix = (
param_name
+ "_"
+ (
opt_cls_name opt_cls_name
if opt_name is None else opt_name) + "_" if opt_name is None
else opt_name
)
+ "_"
)
if state_key.startswith(prefix): if state_key.startswith(prefix):
prefix_offset = state_key[len( prefix_offset = state_key[
prefix):].find("_") + len(prefix) len(prefix) :
].find("_") + len(prefix)
opt_unq_name = state_key[ opt_unq_name = state_key[
len(param_name + "_"):prefix_offset] len(
param_name + "_"
) : prefix_offset
]
# TODO: assert # TODO: assert
# assert opt_unq_name is None # assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name) # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name # always end with "_0" since the unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + dy_state_name = (
"_" + accum_name + "_0") param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[ converted_state[
state_var.name] = converted_state.pop( state_var.name
dy_state_name) ] = converted_state.pop(dy_state_name)
assert var.name in converted_state, \ assert (
"variable [{}] is not in optimizer state file".format(var.name) var.name in converted_state
), "variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name]) self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray): def _set_var(self, var, ndarray):
...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object): ...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object):
def _run(self, inputs, labels=None): def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None) compiled_prog = self._compiled_progs.get(self.mode, None)
assert compiled_prog, \ assert (
"Model is not ready, please call `model.prepare()` first" compiled_prog
), "Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs) inputs = to_list(inputs)
if labels is not None: if labels is not None:
labels = to_list(labels) labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), \ assert len(inputs) == len(self._input_vars[self.mode]), (
"number of inputs" \ "number of inputs"
+ " does not match number of arguments of `forward` method" + " does not match number of arguments of `forward` method"
)
feed = {} feed = {}
input_names = [v.name for v in self._input_vars[self.mode]] input_names = [v.name for v in self._input_vars[self.mode]]
...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object): ...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object):
# train and test may take different arguments # train and test may take different arguments
if inputs[idx] is not None: if inputs[idx] is not None:
feed[n] = inputs[idx] feed[n] = inputs[idx]
if self._amp_level == 'O2' and input_dtypes[ if (
idx] == core.VarDesc.VarType.FP16: self._amp_level == 'O2'
and input_dtypes[idx] == core.VarDesc.VarType.FP16
):
if isinstance(feed[n], core.LoDTensor): if isinstance(feed[n], core.LoDTensor):
feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16)
elif isinstance(feed[n], np.array): elif isinstance(feed[n], np.array):
...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object): ...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object):
else: else:
pruned_fetch_list.append(fetch_var) pruned_fetch_list.append(fetch_var)
rets = self._executor.run(compiled_prog, rets = self._executor.run(
compiled_prog,
feed=feed, feed=feed,
fetch_list=pruned_fetch_list, fetch_list=pruned_fetch_list,
return_numpy=False) return_numpy=False,
)
# restore pruned fetch_list Variable from feeds # restore pruned fetch_list Variable from feeds
for i, name in enumerate(pruned_fetch_idx_name_map): for i, name in enumerate(pruned_fetch_idx_name_map):
...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object): ...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object):
metrics = [] metrics = []
for metric, state in zip(self.model._metrics, metric_states): for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size # cut off padding size
if self.mode != 'train' and self.model._test_dataloader is not None \ if (
and isinstance(self.model._test_dataloader, DataLoader) \ self.mode != 'train'
and self._nranks > 1: and self.model._test_dataloader is not None
and isinstance(self.model._test_dataloader, DataLoader)
and self._nranks > 1
):
total_size = len(self.model._test_dataloader.dataset) total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size # TODO: fixme if have better way to get batch size
samples = state[0].shape[0] samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0) current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size: if current_count + samples >= total_size:
state = [ state = [
s[:int(total_size - current_count), ...] for s in state s[: int(total_size - current_count), ...] for s in state
] ]
self._merge_count[self.mode + '_total'] = 0 self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size - self._merge_count[self.mode + '_batch'] = int(
current_count) total_size - current_count
)
else: else:
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object): ...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object):
if mode != 'train': if mode != 'train':
for op in list(prog.global_block().ops): for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0) prog.global_block()._remove_op(0)
if mode == 'train' and self.model._optimizer \ if (
and self.model._optimizer._learning_rate_map: mode == 'train'
and self.model._optimizer
and self.model._optimizer._learning_rate_map
):
# HACK workaround learning rate map issue # HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name] new_lr_var = prog.global_block().vars[lr_var.name]
@@ -594,20 +658,27 @@ class StaticGraphAdapter(object):
                dist_strategy.amp = True
                dist_strategy.amp_configs = self._amp_configs.copy()
                dist_strategy.amp_configs.update(self._amp_custom_lists)
                dist_strategy.amp_configs['use_pure_fp16'] = (
                    self._amp_level == 'O2'
                )
            self.model._optimizer = fleet.distributed_optimizer(
                self.model._optimizer, strategy=dist_strategy
            )
        elif self._amp_level != "O0" and core.is_compiled_with_cuda:
            amp_lists = (
                paddle.static.amp.AutoMixedPrecisionLists(
                    **self._amp_custom_lists
                )
                if self._amp_custom_lists
                else None
            )
            self.model._optimizer = paddle.static.amp.decorate(
                self.model._optimizer,
                amp_lists=amp_lists,
                use_pure_fp16=self._amp_level == "O2",
                use_fp16_guard=self._use_fp16_guard,
                **self._amp_configs
            )

        self.model._optimizer.minimize(self._loss_endpoint)
@@ -620,7 +691,7 @@ class StaticGraphAdapter(object):
        self._endpoints[mode] = {
            "output": outputs,
            "loss": to_list(losses),
            "metric": metrics,
        }

    def _compile_and_initialize(self, prog, mode):
@@ -628,8 +699,9 @@ class StaticGraphAdapter(object):
        if compiled_prog is not None:
            return compiled_prog

        assert (
            self.model._place is not None
        ), "device is not set, please call `model.prepare()` first"

        place = self.model._place
@@ -642,8 +714,11 @@ class StaticGraphAdapter(object):
        uninitialized = []
        for var_py in self._startup_prog.list_vars():
            var = fluid.global_scope().find_var(var_py.name)
            if (
                not var_py.name.startswith('nccl_id')
                and var
                and var.get_tensor()._is_initialized()
            ):
                continue
            uninitialized.append(var_py)
@@ -651,7 +726,10 @@ class StaticGraphAdapter(object):
            startup_prog = self._startup_prog._prune(uninitialized)
            self._executor.run(startup_prog)

        if (
            self._amp_level == "O2"
            and mode == 'train'
            and core.is_compiled_with_cuda()
        ):
            self.model._optimizer.amp_init(place)
@@ -664,7 +742,6 @@ class StaticGraphAdapter(object):
class DynamicGraphAdapter(object):
    def __init__(self, model):
        super(DynamicGraphAdapter, self).__init__()
        self.model = model
@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object):
            'eval_total': 0,
            'test_total': 0,
            'eval_batch': 0,
            'test_batch': 0,
        }
        self._input_info = None
@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object):
            stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
            stradegy.current_endpoint = ParallelEnv().current_endpoint
            self.ddp_model = fluid.dygraph.parallel.DataParallel(
                self.model.network, stradegy
            )

    @property
    def mode(self):
@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object):
    # TODO multi device in dygraph mode not implemented at present time
    def train_batch(self, inputs, labels=None, update=True):
        assert (
            self.model._optimizer
        ), "model not ready, please call `model.prepare()` first"
        self.model.network.train()
        self.mode = 'train'
        inputs = to_list(inputs)
@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object):
        if self._amp_level != "O0" and self.model._scaler is None:
            self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
        with paddle.amp.auto_cast(
            enable=self._amp_level != 'O0',
            **self._amp_custom_lists,
            level=self._amp_level
        ):
            if self._nranks > 1:
                outputs = self.ddp_model(*[to_variable(x) for x in inputs])
            else:
@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object):
            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
            metrics.append(m)

        return (
            ([to_numpy(l) for l in losses], metrics)
            if len(metrics) > 0
            else [to_numpy(l) for l in losses]
        )
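The block above is the dygraph AMP training path. As a reference, here is a minimal sketch of the same GradScaler + auto_cast pattern outside the adapter; it assumes a GPU-enabled Paddle 2.x install (on CPU the cast is effectively a no-op) and uses a toy network with random data.

# Standalone sketch of the scaler/auto_cast pattern used by train_batch.
import paddle

net = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.Adam(parameters=net.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([8, 4])
label = paddle.rand([8, 2])

with paddle.amp.auto_cast(enable=True, level='O1'):
    out = net(x)
    loss = paddle.nn.functional.mse_loss(out, label)

scaled = scaler.scale(loss)     # scale the loss before backward
scaled.backward()
scaler.minimize(opt, scaled)    # unscale gradients and update parameters
opt.clear_grad()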
    def eval_batch(self, inputs, labels=None):
        self.model.network.eval()
@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object):
        metrics = []
        for metric in self.model._metrics:
            # cut off padding value.
            if (
                self.model._test_dataloader is not None
                and self._nranks > 1
                and isinstance(self.model._test_dataloader, DataLoader)
            ):
                total_size = len(self.model._test_dataloader.dataset)
                samples = outputs[0].shape[0]
                current_count = self._merge_count.get(self.mode + '_total', 0)
                if current_count + samples >= total_size:
                    outputs = [
                        o[: int(total_size - current_count)] for o in outputs
                    ]
                    labels = [
                        l[: int(total_size - current_count)] for l in labels
                    ]
                    self._merge_count[self.mode + '_total'] = 0
                    self._merge_count[self.mode + '_batch'] = int(
                        total_size - current_count
                    )
                else:
                    self._merge_count[self.mode + '_total'] += samples
                    self._merge_count[self.mode + '_batch'] = samples
@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object):
            opt_unq_name = ''
        opt_cls_name = self.model._optimizer.__class__.__name__
        opt_name = opt_unq_name[: opt_unq_name.rfind("_")]  # remove suffix idx
        param_names = [param.name for param in self.model.network.parameters()]
        for var_name, state_var in sorted(
            optim_state.items(), key=lambda x: len(x[0]), reverse=True
        ):
            if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
                # NOTE: dygraph saved global_step is 1 larger than that in
                # static-graph, since the time of global_step to increase is
                # different.
                if var_name == "@LR_DECAY_COUNTER@":
                    converted_state["global_step"] = (
                        np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1
                    )
            else:
                # moment and other accumulators
                # extend state dict to include promising dygraph names
                for param_name in param_names:
                    if var_name.startswith(param_name + "_" + opt_name):
                        # when init optimizer with name
                        accum_name = var_name[
                            len(param_name + "_" + opt_name + "_") :
                        ]
                    elif (
                        var_name.startswith(param_name + "_")
                        and opt_name == opt_cls_name
                    ):
                        # when init optimizer without name
                        accum_name = var_name[len(param_name + "_") :]
                    else:
                        continue
                    # remove suffix idx
                    accum_name = accum_name[: accum_name.rfind("_")]
                    # state names always end with "_0" in dygraph because of the
                    # unique optimizer._name
                    dy_state_name = (
                        param_name
                        + "_"
                        + opt_unq_name
                        + "_"
                        + accum_name
                        + "_0"
                    )
                    converted_state[dy_state_name] = state_var

        if not hasattr(self.model._optimizer, 'set_state_dict'):
@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object):
            self.model._optimizer.set_state_dict(converted_state)
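The name juggling in the loop above converts static-graph accumulator names into the dygraph naming scheme. A worked example with purely hypothetical names, mirroring the same string operations:

# Illustrative only: how one static accumulator name maps onto its dygraph name.
param_name = "linear_0.w_0"
opt_unq_name = "adam_0"                                # optimizer name + unique suffix
opt_name = opt_unq_name[: opt_unq_name.rfind("_")]     # -> "adam"

static_name = "linear_0.w_0_adam_moment1_0"            # as saved by the static graph
accum_name = static_name[len(param_name + "_" + opt_name + "_"):]   # "moment1_0"
accum_name = accum_name[: accum_name.rfind("_")]                    # "moment1"

dy_state_name = param_name + "_" + opt_unq_name + "_" + accum_name + "_0"
print(dy_state_name)                                   # linear_0.w_0_adam_0_moment1_0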
    def prepare(self):
        if (
            self._amp_level == "O2"
            and self.model.mode == 'train'
            and core.is_compiled_with_cuda()
        ):
            self.model.network, self.model._optimizer = paddle.amp.decorate(
                models=self.model.network,
                optimizers=self.model._optimizer,
                level='O2',
            )
        if self._amp_level != "O0":
            self.model._scaler = None
class Model(object):
    """
    A Model object is a network with training and inference features.
    Dynamic graph and static graph are supported at the same time,
    switched by `paddle.enable_static()`. The usage is as follows.
@@ -1053,6 +1156,7 @@ class Model(object):
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
""" """
Run one training step on one batch of data. And using `update` indicates Run one training step on one batch of data. And using `update` indicates
whether optimizer update gradients computing by this batch. whether optimizer update gradients computing by this batch.
...@@ -1098,6 +1202,7 @@ class Model(object): ...@@ -1098,6 +1202,7 @@ class Model(object):
loss = model.train_batch([data], [label]) loss = model.train_batch([data], [label])
print(loss) print(loss)
# [array([2.192784], dtype=float32)] # [array([2.192784], dtype=float32)]
""" """
loss = self._adapter.train_batch(inputs, labels, update) loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1107,6 +1212,7 @@ class Model(object): ...@@ -1107,6 +1212,7 @@ class Model(object):
@no_grad() @no_grad()
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
""" """
Run one evaluating step on a batch of data. Run one evaluating step on a batch of data.
Args: Args:
...@@ -1150,6 +1256,7 @@ class Model(object): ...@@ -1150,6 +1256,7 @@ class Model(object):
loss, acc = model.eval_batch([data], [label]) loss, acc = model.eval_batch([data], [label])
print(loss, acc) print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0] # [array([2.8825705], dtype=float32)] [0.0]
""" """
loss = self._adapter.eval_batch(inputs, labels) loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1159,6 +1266,7 @@ class Model(object): ...@@ -1159,6 +1266,7 @@ class Model(object):
@no_grad() @no_grad()
def predict_batch(self, inputs): def predict_batch(self, inputs):
""" """
Run one predicting step on a batch of data. Run one predicting step on a batch of data.
Args: Args:
...@@ -1197,6 +1305,7 @@ class Model(object): ...@@ -1197,6 +1305,7 @@ class Model(object):
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)] # dtype=float32)]
""" """
loss = self._adapter.predict_batch(inputs) loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1205,6 +1314,7 @@ class Model(object): ...@@ -1205,6 +1314,7 @@ class Model(object):
def save(self, path, training=True): def save(self, path, training=True):
""" """
This function saves parameters, optimizer information or model and This function saves parameters, optimizer information or model and
paramters only for inference to path. It depends on the parameter paramters only for inference to path. It depends on the parameter
`training`. `training`.
...@@ -1272,6 +1382,7 @@ class Model(object): ...@@ -1272,6 +1382,7 @@ class Model(object):
model.fit(data, epochs=1, batch_size=32, verbose=0) model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference model.save('inference_model', False) # save for inference
""" """
if ParallelEnv().local_rank == 0: if ParallelEnv().local_rank == 0:
...@@ -1282,6 +1393,7 @@ class Model(object): ...@@ -1282,6 +1393,7 @@ class Model(object):
def load(self, path, skip_mismatch=False, reset_optimizer=False): def load(self, path, skip_mismatch=False, reset_optimizer=False):
""" """
Load from files storing the model states and optimizer states. The file Load from files storing the model states and optimizer states. The file
for optimizer states is not necessary if no need to restore the optimizer. for optimizer states is not necessary if no need to restore the optimizer.
...@@ -1329,6 +1441,7 @@ class Model(object): ...@@ -1329,6 +1441,7 @@ class Model(object):
model.save('checkpoint/test') model.save('checkpoint/test')
model.load('checkpoint/test') model.load('checkpoint/test')
""" """
def _load_state_from_path(path): def _load_state_from_path(path):
...@@ -1341,17 +1454,24 @@ class Model(object): ...@@ -1341,17 +1454,24 @@ class Model(object):
state = param_state.get(key, None) state = param_state.get(key, None)
if state is None: if state is None:
raise ValueError( raise ValueError(
"{} is not found in the providing file.".format(key)) "{} is not found in the providing file.".format(key)
)
if list(state.shape) != list(param.shape): if list(state.shape) != list(param.shape):
raise ValueError( raise ValueError(
"{} receives a shape {}, but the expected shape is {}.". "{} receives a shape {}, but the expected shape is {}.".format(
format(key, list(state.shape), list(param.shape))) key, list(state.shape), list(param.shape)
)
)
return param, state return param, state
def _strip_postfix(path): def _strip_postfix(path):
path, ext = os.path.splitext(path) path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ assert ext in [
"Unknown postfix {} from weights".format(ext) '',
'.pdparams',
'.pdopt',
'.pdmodel',
], "Unknown postfix {} from weights".format(ext)
return path return path
path = _strip_postfix(path) path = _strip_postfix(path)
...@@ -1365,15 +1485,17 @@ class Model(object): ...@@ -1365,15 +1485,17 @@ class Model(object):
except ValueError as err: except ValueError as err:
if skip_mismatch: if skip_mismatch:
warnings.warn( warnings.warn(
("Skip loading for {}. ".format(key) + str(err))) ("Skip loading for {}. ".format(key) + str(err))
)
# reset optimizer when mismatch happens # reset optimizer when mismatch happens
reset_optimizer = True reset_optimizer = True
else: else:
raise err raise err
matched_param_state.append(match_res) matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path( optim_state = (
path + ".pdopt") None if reset_optimizer else _load_state_from_path(path + ".pdopt")
)
# TODO: support save/load scaler state in static graph # TODO: support save/load scaler state in static graph
if _non_static_mode(): if _non_static_mode():
...@@ -1382,13 +1504,15 @@ class Model(object): ...@@ -1382,13 +1504,15 @@ class Model(object):
if os.path.exists(path + '.pdscaler'): if os.path.exists(path + '.pdscaler'):
scaler_state = paddle.load(path + '.pdscaler') scaler_state = paddle.load(path + '.pdscaler')
return self._adapter.load(matched_param_state, optim_state, return self._adapter.load(
scaler_state) matched_param_state, optim_state, scaler_state
)
else: else:
return self._adapter.load(matched_param_state, optim_state) return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs): def parameters(self, *args, **kwargs):
""" """
Returns a list of parameters of the model. Returns a list of parameters of the model.
Returns: Returns:
...@@ -1411,17 +1535,19 @@ class Model(object): ...@@ -1411,17 +1535,19 @@ class Model(object):
nn.Linear(200, 10)), input) nn.Linear(200, 10)), input)
params = model.parameters() params = model.parameters()
""" """
return self._adapter.parameters() return self._adapter.parameters()
def _prepare_amp(self, amp_configs): def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs(): def _check_pure_fp16_configs():
# pure float16 training has some restricts now # pure float16 training has some restricts now
if self._adapter._amp_level == "O2" and self._optimizer._grad_clip: if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
# clip by value is not supported # clip by value is not supported
assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \ assert isinstance(
"Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {} self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {} self._adapter._amp_configs = {}
@@ -1433,7 +1559,8 @@ class Model(object):
        elif isinstance(amp_configs, str):
            if amp_configs not in ('O0', 'O1', 'O2'):
                raise ValueError(
                    "The level of amp_configs should be 'O0', 'O1' or 'O2'."
                )
            self._adapter._amp_level = amp_configs
            _check_pure_fp16_configs()
            return
@@ -1442,7 +1569,8 @@ class Model(object):
            self._adapter._amp_level = 'O1'
        elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
            raise ValueError(
                "amp_configs['level'] should be 'O0', 'O1' or 'O2'."
            )
        else:
            self._adapter._amp_level = amp_configs['level']
        amp_config_key_set = set(amp_configs.keys()) - {'level'}
@@ -1459,12 +1587,14 @@ class Model(object):
        # construct amp_custom_lists
        if self._adapter._amp_level != 'O0' and amp_config_key_set:
            for param_name in [
                'custom_white_list',
                'custom_black_list',
                'custom_black_varnames',
            ]:
                if param_name in amp_config_key_set:
                    self._adapter._amp_custom_lists[param_name] = amp_configs[
                        param_name
                    ]
                    amp_config_key_set -= {param_name}

        def _check_amp_configs(amp_config_key_set):
@@ -1479,13 +1609,16 @@ class Model(object):
            }
            if amp_config_key_set - accepted_param_set:
                raise ValueError(
                    "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format(
                        tuple(amp_config_key_set - accepted_param_set)
                    )
                )

            if 'use_fp16_guard' in amp_config_key_set:
                if _non_static_mode():
                    raise ValueError(
                        "'use_fp16_guard' is supported in static mode only."
                    )
                self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
                amp_config_key_set.remove('use_fp16_guard')
@@ -1495,12 +1628,11 @@ class Model(object):
        for key in amp_configs_set:
            self._adapter._amp_configs[key] = amp_configs[key]
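Putting `_prepare_amp` together: the `amp_configs` argument of `Model.prepare` can be a bare level string or a dict carrying the level, the custom lists filtered out above, and loss-scaling options. A rough usage sketch, assuming a standard Paddle 2.x install (the network and values are made up):

# Sketch of the amp_configs forms the validation above accepts.
import paddle
from paddle.static import InputSpec

net = paddle.nn.Linear(784, 10)
model = paddle.Model(net, inputs=[InputSpec([None, 784], 'float32', 'x')])

# amp_configs='O1' would also be accepted; the dict form adds custom lists
# and loss-scaling options on top of the level.
model.prepare(
    optimizer=paddle.optimizer.Adam(parameters=model.parameters()),
    loss=paddle.nn.CrossEntropyLoss(),
    amp_configs={
        'level': 'O1',
        'custom_white_list': {'elementwise_add'},
        'custom_black_list': {'reduce_sum'},
        'init_loss_scaling': 1024,
    },
)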
def prepare(self, def prepare(
optimizer=None, self, optimizer=None, loss=None, metrics=None, amp_configs=None
loss=None, ):
metrics=None,
amp_configs=None):
""" """
Configures the model before runing. Configures the model before runing.
Args: Args:
...@@ -1532,6 +1664,7 @@ class Model(object): ...@@ -1532,6 +1664,7 @@ class Model(object):
Returns: Returns:
None None
""" """
self._place = _get_device() self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace): if isinstance(self._place, fluid.CUDAPlace):
...@@ -1539,15 +1672,17 @@ class Model(object): ...@@ -1539,15 +1672,17 @@ class Model(object):
if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid._non_static_mode(): if fluid._non_static_mode():
main_prog_seed = fluid.default_main_program().random_seed main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = fluid.default_startup_program( startup_prog_seed = (
).random_seed fluid.default_startup_program().random_seed
)
fluid.disable_dygraph() fluid.disable_dygraph()
paddle.disable_static(self._place) paddle.disable_static(self._place)
# enable_dygraph would create and switch to a new program, # enable_dygraph would create and switch to a new program,
# thus also copy seed to the new program # thus also copy seed to the new program
fluid.default_main_program().random_seed = main_prog_seed fluid.default_main_program().random_seed = main_prog_seed
fluid.default_startup_program( fluid.default_startup_program().random_seed = (
).random_seed = startup_prog_seed startup_prog_seed
)
else: else:
prepare_distributed_context(self._place) prepare_distributed_context(self._place)
_parallel_context_initialized = True _parallel_context_initialized = True
...@@ -1562,15 +1697,16 @@ class Model(object): ...@@ -1562,15 +1697,16 @@ class Model(object):
metrics = metrics or [] metrics = metrics or []
for metric in to_list(metrics): for metric in to_list(metrics):
assert isinstance(metric, Metric), \ assert isinstance(
"{} is not sub class of Metric".format( metric, Metric
metric.__class__.__name__) ), "{} is not sub class of Metric".format(metric.__class__.__name__)
self._metrics = to_list(metrics) self._metrics = to_list(metrics)
self._prepare_amp(amp_configs) self._prepare_amp(amp_configs)
self._adapter.prepare() self._adapter.prepare()
def fit(self, def fit(
self,
train_data=None, train_data=None,
eval_data=None, eval_data=None,
batch_size=1, batch_size=1,
...@@ -1585,8 +1721,10 @@ class Model(object): ...@@ -1585,8 +1721,10 @@ class Model(object):
num_workers=0, num_workers=0,
callbacks=None, callbacks=None,
accumulate_grad_batches=1, accumulate_grad_batches=1,
num_iters=None): num_iters=None,
):
""" """
Trains the model for a fixed number of epochs. If `eval_data` is set, Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch. evaluation will be done at the end of each epoch.
...@@ -1641,7 +1779,7 @@ class Model(object): ...@@ -1641,7 +1779,7 @@ class Model(object):
How to make a batch is done internally. How to make a batch is done internally.
.. code-block:: python .. code-block:: python
:name: code-example1 :name: code-example3
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1681,7 +1819,7 @@ class Model(object): ...@@ -1681,7 +1819,7 @@ class Model(object):
DataLoader. DataLoader.
.. code-block:: python .. code-block:: python
:name: code-example2 :name: code-example4
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1718,31 +1856,38 @@ class Model(object): ...@@ -1718,31 +1856,38 @@ class Model(object):
val_loader, val_loader,
epochs=2, epochs=2,
save_dir='mnist_checkpoint') save_dir='mnist_checkpoint')
""" """
assert train_data is not None, \ assert train_data is not None, "train_data must be given!"
"train_data must be given!"
if isinstance(train_data, Dataset): if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(train_data, train_sampler = DistributedBatchSampler(
train_data,
batch_size=batch_size, batch_size=batch_size,
shuffle=shuffle, shuffle=shuffle,
drop_last=drop_last) drop_last=drop_last,
train_loader = DataLoader(train_data, )
train_loader = DataLoader(
train_data,
batch_sampler=train_sampler, batch_sampler=train_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
else: else:
train_loader = train_data train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler, batch_sampler=eval_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
elif eval_data is not None: elif eval_data is not None:
eval_loader = eval_data eval_loader = eval_data
else: else:
@@ -1755,8 +1900,11 @@ class Model(object):
        steps = self._len_data_loader(train_loader)
        self.num_iters = num_iters
        if (
            num_iters is not None
            and isinstance(num_iters, int)
            and isinstance(steps, int)
        ):
            assert num_iters > 0, "num_iters must be greater than 0!"
            epochs = (num_iters // steps) + 1
            steps = min(num_iters, steps)
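A quick worked example of the arithmetic above (numbers invented):

# How num_iters overrides epochs/steps in fit().
num_iters = 250                       # total training iterations requested
steps = 100                           # batches per epoch from the data loader
epochs = (num_iters // steps) + 1     # -> 3 epochs are scheduled
steps = min(num_iters, steps)         # -> at most 100 steps per epoch
print(epochs, steps)                  # 3 100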
...@@ -1784,10 +1932,10 @@ class Model(object): ...@@ -1784,10 +1932,10 @@ class Model(object):
if do_eval and epoch % eval_freq == 0: if do_eval and epoch % eval_freq == 0:
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval',
'metrics': self._metrics_name() {'steps': eval_steps, 'metrics': self._metrics_name()},
}) )
eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1798,14 +1946,16 @@ class Model(object): ...@@ -1798,14 +1946,16 @@ class Model(object):
cbks.on_end('train', logs) cbks.on_end('train', logs)
self._test_dataloader = None self._test_dataloader = None
def evaluate(self, def evaluate(
self,
eval_data, eval_data,
batch_size=1, batch_size=1,
log_freq=10, log_freq=10,
verbose=2, verbose=2,
num_workers=0, num_workers=0,
callbacks=None, callbacks=None,
num_iters=None): num_iters=None,
):
""" """
Evaluate the loss and metrics of the model on input dataset. Evaluate the loss and metrics of the model on input dataset.
...@@ -1859,13 +2009,16 @@ class Model(object): ...@@ -1859,13 +2009,16 @@ class Model(object):
""" """
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler, batch_sampler=eval_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
else: else:
eval_loader = eval_data eval_loader = eval_data
...@@ -1881,15 +2034,17 @@ class Model(object): ...@@ -1881,15 +2034,17 @@ class Model(object):
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
self.num_iters = num_iters self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance( if (
eval_steps, int): num_iters is not None
and isinstance(num_iters, int)
and isinstance(eval_steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!" assert num_iters > 0, "num_iters must be greater than 0!"
eval_steps = min(num_iters, eval_steps) eval_steps = min(num_iters, eval_steps)
self.num_iters = eval_steps self.num_iters = eval_steps
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()}
'metrics': self._metrics_name() )
})
logs = self._run_one_epoch(eval_loader, cbks, 'eval') logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1903,13 +2058,15 @@ class Model(object): ...@@ -1903,13 +2058,15 @@ class Model(object):
return eval_result return eval_result
def predict(self, def predict(
self,
test_data, test_data,
batch_size=1, batch_size=1,
num_workers=0, num_workers=0,
stack_outputs=False, stack_outputs=False,
verbose=1, verbose=1,
callbacks=None): callbacks=None,
):
""" """
Compute the output predictions on testing data. Compute the output predictions on testing data.
...@@ -1980,13 +2137,16 @@ class Model(object): ...@@ -1980,13 +2137,16 @@ class Model(object):
""" """
if test_data is not None and isinstance(test_data, Dataset): if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(test_data, test_sampler = DistributedBatchSampler(
batch_size=batch_size) test_data, batch_size=batch_size
test_loader = DataLoader(test_data, )
test_loader = DataLoader(
test_data,
batch_sampler=test_sampler, batch_sampler=test_sampler,
places=self._place, places=self._place,
num_workers=num_workers, num_workers=num_workers,
return_list=True) return_list=True,
)
else: else:
test_loader = test_data test_loader = test_data
...@@ -2036,7 +2196,8 @@ class Model(object): ...@@ -2036,7 +2196,8 @@ class Model(object):
if self._is_shape_inferred: if self._is_shape_inferred:
warnings.warn( warnings.warn(
"'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization." "'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
% self._input_info[0]) % self._input_info[0]
)
paddle.jit.save(layer, path, input_spec=self._inputs) paddle.jit.save(layer, path, input_spec=self._inputs)
...@@ -2047,7 +2208,8 @@ class Model(object): ...@@ -2047,7 +2208,8 @@ class Model(object):
raise ValueError( raise ValueError(
"The input path MUST be format of dirname/file_prefix " "The input path MUST be format of dirname/file_prefix "
"[dirname\\file_prefix in Windows system], but received " "[dirname\\file_prefix in Windows system], but received "
"file_prefix is empty string.") "file_prefix is empty string."
)
dirname = os.path.dirname(path) dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname): if dirname and not os.path.exists(dirname):
...@@ -2058,21 +2220,24 @@ class Model(object): ...@@ -2058,21 +2220,24 @@ class Model(object):
params_filename = file_prefix + INFER_PARAMS_SUFFIX params_filename = file_prefix + INFER_PARAMS_SUFFIX
prog = self._adapter._progs.get('test', None) prog = self._adapter._progs.get('test', None)
assert prog, \ assert (
"Model is not ready, please call `model.prepare()` first" prog
), "Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True) infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']] input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output'] endpoints = self._adapter._endpoints['test']['output']
fluid.io.save_inference_model(model_path, fluid.io.save_inference_model(
model_path,
input_names, input_names,
endpoints, endpoints,
self._adapter._executor, self._adapter._executor,
main_program=infer_prog, main_program=infer_prog,
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename) params_filename=params_filename,
)
def _run_one_epoch( def _run_one_epoch(
self, self,
...@@ -2098,16 +2263,21 @@ class Model(object): ...@@ -2098,16 +2263,21 @@ class Model(object):
# LoDTensor.shape is callable, where LoDTensor comes from # LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph # DataLoader in static graph
batch_size = data[0].shape()[0] if callable( batch_size = (
data[0].shape) else data[0].shape[0] data[0].shape()[0]
if callable(data[0].shape)
else data[0].shape[0]
)
callbacks.on_batch_begin(mode, step, logs) callbacks.on_batch_begin(mode, step, logs)
if mode != 'predict': if mode != 'predict':
_inputs = [data[:len(self._inputs)], data[len(self._inputs):]] _inputs = [data[: len(self._inputs)], data[len(self._inputs) :]]
if mode == 'train': if mode == 'train':
_inputs.append((step + 1) % self._accumulate == 0 _inputs.append(
or step + 1 == len(data_loader)) (step + 1) % self._accumulate == 0
or step + 1 == len(data_loader)
)
outs = getattr(self, mode + '_batch')(*_inputs) outs = getattr(self, mode + '_batch')(*_inputs)
...@@ -2128,15 +2298,17 @@ class Model(object): ...@@ -2128,15 +2298,17 @@ class Model(object):
logs[k] = v logs[k] = v
else: else:
if self._inputs is not None: if self._inputs is not None:
outs = self.predict_batch(data[:len(self._inputs)]) outs = self.predict_batch(data[: len(self._inputs)])
else: else:
outs = self.predict_batch(data) outs = self.predict_batch(data)
outputs.append(outs) outputs.append(outs)
logs['step'] = step logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get( if (
mode + '_batch', 0) <= 0: mode == 'train'
or self._adapter._merge_count.get(mode + '_batch', 0) <= 0
):
logs['batch_size'] = batch_size * ParallelEnv().nranks logs['batch_size'] = batch_size * ParallelEnv().nranks
else: else:
logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] logs['batch_size'] = self._adapter._merge_count[mode + '_batch']
...@@ -2190,8 +2362,9 @@ class Model(object): ...@@ -2190,8 +2362,9 @@ class Model(object):
# {'total_params': 61610, 'trainable_params': 61610} # {'total_params': 61610, 'trainable_params': 61610}
""" """
assert (input_size is not None or self._inputs assert (
is not None), "'input_size' or 'self._input' must be set" input_size is not None or self._inputs is not None
), "'input_size' or 'self._input' must be set"
if input_size is not None: if input_size is not None:
_input_size = input_size _input_size = input_size
else: else:
...@@ -2208,7 +2381,10 @@ class Model(object): ...@@ -2208,7 +2381,10 @@ class Model(object):
if is_input: if is_input:
arg_names = extract_args(self.network.forward)[1:] arg_names = extract_args(self.network.forward)[1:]
# While Saving inference model in dygraph, and providing inputs only in running. # While Saving inference model in dygraph, and providing inputs only in running.
if shapes is not None and dtypes is not None and fluid._non_static_mode( if (
shapes is not None
and dtypes is not None
and fluid._non_static_mode()
): ):
out_specs = [ out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i]) Input(name=n, dtype=dtypes[i], shape=shapes[i])
...@@ -2221,7 +2397,8 @@ class Model(object): ...@@ -2221,7 +2397,8 @@ class Model(object):
elif isinstance(specs, dict): elif isinstance(specs, dict):
assert is_input is False assert is_input is False
out_specs = [ out_specs = [
specs[n] for n in extract_args(self.network.forward) specs[n]
for n in extract_args(self.network.forward)
if n != 'self' if n != 'self'
] ]
else: else:
...@@ -2232,8 +2409,10 @@ class Model(object): ...@@ -2232,8 +2409,10 @@ class Model(object):
assert isinstance(spec, Input) assert isinstance(spec, Input)
if spec.name is None: if spec.name is None:
raise ValueError( raise ValueError(
"Requires Input[{}].name != None, but receive `None` with {}." "Requires Input[{}].name != None, but receive `None` with {}.".format(
.format(i, spec)) i, spec
)
)
return out_specs return out_specs
@@ -2258,6 +2437,7 @@ class Model(object):
        "Update self._inputs according to given inputs."
        self._input_info = self._adapter._input_info
        if self._input_info is not None and len(self._input_info) == 2:
            self._inputs = self._verify_spec(
                None, self._input_info[0], self._input_info[1], True
            )
            self._is_shape_inferred = True
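The shape-inference branch above is what runs when no input specs were passed to the constructor. A short sketch of the two paths in dygraph mode, with hypothetical shapes:

# Explicit spec: the exported inference model keeps a stable [None, 784] shape.
import paddle
from paddle.static import InputSpec

net = paddle.nn.Linear(784, 10)
explicit = paddle.Model(net, inputs=[InputSpec([None, 784], 'float32', 'x')])

# No spec: the shape is inferred from the first real batch, which is what
# later triggers the "_is_shape_inferred" warning when saving for inference.
inferred = paddle.Model(net)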
@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
    name=None,
):
    r"""
    The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:

    .. code-block:: python

        y = layer_norm(residual + dropout(bias + x))

    Parameters:
@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        Tensor, The output Tensor, the data type and shape is same as `x`.

    Examples:

        .. code-block:: python

            # required: gpu
@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
                x, residual, bias)
            # [2, 4, 128]
            print(output.shape)

    """
    seed = None
    if mode not in ('downscale_in_infer', 'upscale_in_train'):
...
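As a rough, unfused reference for the pseudo code in the docstring above, the same math can be spelled out with plain functional ops. This only illustrates the semantics; the API dispatches to a fused GPU kernel, and dropout randomness will differ.

# Unfused reference (illustration only) for y = layer_norm(residual + dropout(bias + x)).
import paddle
import paddle.nn.functional as F

x = paddle.rand((2, 4, 128))
residual = paddle.rand((2, 4, 128))
bias = paddle.rand((128,))
weight = paddle.ones((128,))
ln_bias = paddle.zeros((128,))

y = F.layer_norm(
    residual + F.dropout(x + bias, p=0.5, training=True),
    normalized_shape=[128],
    weight=weight,
    bias=ln_bias,
)
print(y.shape)  # [2, 4, 128]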
@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer
from paddle.framework import ParamAttr
import paddle
from paddle.nn.layer.transformer import (
    _convert_attention_mask,
    _convert_param_attr_to_list,
)
from paddle.nn.initializer import Constant
from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode
@@ -51,7 +54,8 @@ def _to_dtype(t, dtype):
    if t.place.is_gpu_place():
        size_dtype = core.size_of_dtype(dtype)
        waiting_alloc_memory = (
            ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
        )
        gpu_memory_available = core.gpu_memory_available()
        if gpu_memory_available < waiting_alloc_memory:
            t_used = t._copy_to(paddle.CPUPlace(), False)
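The `waiting_alloc_memory` expression above estimates the target allocation rounded up to the next 256-byte block plus 20% headroom. A small worked example with an invented shape:

# Worked example of the padded allocation estimate.
import numpy as np

shape = (2, 4, 128)       # tensor shape
size_dtype = 2            # bytes per element for float16
raw_bytes = np.prod(shape) * size_dtype            # 2048 bytes
padded = (raw_bytes / 256 + 1) * 256 * 1.2         # next 256 B block + 20% headroom
print(raw_bytes, padded)                           # 2048 2764.8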
...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
embed_dim, embed_dim,
dropout_rate=0.5, dropout_rate=0.5,
weight_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
epsilon=1e-5, epsilon=1e-5,
name=None): name=None,
):
super(FusedBiasDropoutResidualLayerNorm, self).__init__() super(FusedBiasDropoutResidualLayerNorm, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but recieved {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim)
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._weight_attr = weight_attr self._weight_attr = weight_attr
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.linear_bias = self.create_parameter(shape=[embed_dim], self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=self._bias_attr, attr=self._bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=self._weight_attr, attr=self._weight_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=self._bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=self._bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self._epsilon = epsilon self._epsilon = epsilon
...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
mode='upscale_in_train', mode='upscale_in_train',
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, self.embed_dim,
self._dtype, name_str) self.seq_len,
self.dropout_rate,
self._epsilon,
self._dtype,
name_str,
)
class FusedMultiHeadAttention(Layer): class FusedMultiHeadAttention(Layer):
...@@ -246,7 +263,8 @@ class FusedMultiHeadAttention(Layer): ...@@ -246,7 +263,8 @@ class FusedMultiHeadAttention(Layer):
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
embed_dim, embed_dim,
num_heads, num_heads,
dropout_rate=0.5, dropout_rate=0.5,
...@@ -266,13 +284,19 @@ class FusedMultiHeadAttention(Layer): ...@@ -266,13 +284,19 @@ class FusedMultiHeadAttention(Layer):
epsilon=1e-5, epsilon=1e-5,
nranks=1, nranks=1,
ring_id=-1, ring_id=-1,
name=None): name=None,
):
super(FusedMultiHeadAttention, self).__init__() super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but received {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
assert num_heads > 0, ("Expected nhead to be greater than 0, " "but received {}".format(embed_dim)
"but received {}".format(num_heads)) )
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer): ...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer):
self.kdim = kdim self.kdim = kdim
self.vdim = vdim self.vdim = vdim
self.need_weights = need_weights self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
assert need_weights is False, "Only support need_weight is False now." assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel # tensor model parallel
...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer): ...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer):
shape=[3, num_heads, self.head_dim, embed_dim], shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self.qkv_bias = self.create_parameter( self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self.linear_weight = self.create_parameter( self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self.linear_bias = self.create_parameter(shape=[embed_dim], )
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr, attr=linear_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer):
self.pre_ln_scale = self.create_parameter( self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr, attr=pre_ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, )
shape=[embed_dim], self.pre_ln_bias = self.create_parameter(
is_bias=True) attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.ln_scale = None self.ln_scale = None
self.ln_bias = None self.ln_bias = None
else: else:
...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer):
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate self.attn_dropout_rate = attn_dropout_rate
...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer): ...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format( return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format(
self.embed_dim, self.num_heads, self.dropout_rate, self.embed_dim,
self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim, self.num_heads,
self.normalize_before, self.need_weights, self._dtype, name_str) self.dropout_rate,
self.attn_dropout_rate,
self._epsilon,
self.kdim,
self.vdim,
self.normalize_before,
self.need_weights,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -495,7 +538,8 @@ class FusedFeedForward(Layer): ...@@ -495,7 +538,8 @@ class FusedFeedForward(Layer):
# (1, 8, 8) # (1, 8, 8)
""" """
def __init__(self, def __init__(
self,
d_model, d_model,
dim_feedforward, dim_feedforward,
dropout_rate=0.1, dropout_rate=0.1,
...@@ -513,15 +557,20 @@ class FusedFeedForward(Layer): ...@@ -513,15 +557,20 @@ class FusedFeedForward(Layer):
ln2_bias_attr=None, ln2_bias_attr=None,
nranks=1, nranks=1,
ring_id=-1, ring_id=-1,
name=None): name=None,
):
super(FusedFeedForward, self).__init__() super(FusedFeedForward, self).__init__()
assert d_model > 0, ( assert (
"Expected d_model to be greater than 0, but received {}".format( d_model > 0
d_model)) ), "Expected d_model to be greater than 0, but received {}".format(
assert dim_feedforward > 0, ( d_model
"Expected dim_feedforward to be greater than 0, but received {}". )
format(dim_feedforward)) assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._d_model = d_model self._d_model = d_model
...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer): ...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer):
dim_feedforward = dim_feedforward // nranks dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self._act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self._act_method = activation self._act_method = activation
self._normalize_before = normalize_before self._normalize_before = normalize_before
self._epsilon = epsilon self._epsilon = epsilon
...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer): ...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer):
shape=[d_model, dim_feedforward], shape=[d_model, dim_feedforward],
attr=linear1_weight_attr, attr=linear1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self._linear1_bias = self.create_parameter(shape=[dim_feedforward], )
self._linear1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=linear1_bias_attr, attr=linear1_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self._linear2_weight = self.create_parameter( self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model], shape=[dim_feedforward, d_model],
attr=linear2_weight_attr, attr=linear2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self._linear2_bias = self.create_parameter(shape=[d_model], self._linear2_bias = self.create_parameter(
shape=[d_model],
attr=linear2_bias_attr, attr=linear2_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
if nranks > 1: if nranks > 1:
assert ring_id != -1 assert ring_id != -1
...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer): ...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln1_scale_attr, attr=ln1_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln1_bias = self.create_parameter(shape=[d_model], )
attr=ln1_bias_attr, self._ln1_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln1_bias_attr, is_bias=True
)
self._ln2_scale = None self._ln2_scale = None
self._ln2_bias = None self._ln2_bias = None
else: else:
...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer): ...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln2_scale_attr, attr=ln2_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln2_bias = self.create_parameter(shape=[d_model], )
attr=ln2_bias_attr, self._ln2_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln2_bias_attr, is_bias=True
)
self.name = name self.name = name
...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer): ...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer):
pre_layer_norm=self._normalize_before, pre_layer_norm=self._normalize_before,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format( return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format(
self._d_model, self._dim_feedforward, self._dropout_rate, self._d_model,
self._epsilon, self._act_method, self._act_dropout_rate, self._dim_feedforward,
self._normalize_before, self._dtype, name_str) self._dropout_rate,
self._epsilon,
self._act_method,
self._act_dropout_rate,
self._normalize_before,
self._dtype,
name_str,
)
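
    For reference, a minimal usage sketch of this layer, assuming the public `paddle.incubate.nn.FusedFeedForward` export and a GPU build of Paddle (the fused kernels are GPU-only); the shapes simply follow the constructor arguments handled above.

    .. code-block:: python

        # required: gpu
        import paddle
        from paddle.incubate.nn import FusedFeedForward

        fused_ffn = FusedFeedForward(d_model=8, dim_feedforward=8)
        x = paddle.rand((1, 8, 8))   # [batch_size, seq_len, d_model]
        out = fused_ffn(x)           # same shape as x
        print(out.shape)             # [1, 8, 8]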
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer): ...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer):
class FusedTransformerEncoderLayer(Layer): class FusedTransformerEncoderLayer(Layer):
""" """
    FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
    attention and feedforward network. Before and after each sub-layer, pre-process
    and post-process would be applied on the input and output accordingly. If
...@@ -681,7 +749,6 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -681,7 +749,6 @@ class FusedTransformerEncoderLayer(Layer):
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
...@@ -694,9 +761,11 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -694,9 +761,11 @@ class FusedTransformerEncoderLayer(Layer):
attn_mask = paddle.rand((2, 2, 4, 4)) attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
d_model, d_model,
nhead, nhead,
dim_feedforward, dim_feedforward,
...@@ -706,21 +775,33 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -706,21 +775,33 @@ class FusedTransformerEncoderLayer(Layer):
act_dropout_rate=None, act_dropout_rate=None,
normalize_before=False, normalize_before=False,
weight_attr=None, weight_attr=None,
bias_attr=None): bias_attr=None,
):
self._config = locals() self._config = locals()
self._config.pop("self") self._config.pop("self")
self._config.pop("__class__", None) # py3 self._config.pop("__class__", None) # py3
super(FusedTransformerEncoderLayer, self).__init__() super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, " assert (
"but received {}".format(d_model)) d_model > 0
assert nhead > 0, ("Expected nhead to be greater than 0, " ), "Expected d_model to be greater than 0, " "but received {}".format(
"but received {}".format(nhead)) d_model
)
assert (
nhead > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
nhead
)
assert dim_feedforward > 0, ( assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, " "Expected dim_feedforward to be greater than 0, "
"but received {}".format(dim_feedforward)) "but received {}".format(dim_feedforward)
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate )
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate attn_dropout_rate = (
dropout_rate if attn_dropout_rate is None else attn_dropout_rate
)
act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
weight_attrs = _convert_param_attr_to_list(weight_attr, 2) weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
...@@ -739,9 +820,11 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -739,9 +820,11 @@ class FusedTransformerEncoderLayer(Layer):
pre_ln_scale_attr=weight_attrs[0], pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0], pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0], ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0]) ln_bias_attr=bias_attrs[0],
)
self.ffn = FusedFeedForward(d_model, self.ffn = FusedFeedForward(
d_model,
dim_feedforward, dim_feedforward,
dropout_rate=dropout_rate, dropout_rate=dropout_rate,
activation=activation, activation=activation,
...@@ -750,11 +833,14 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -750,11 +833,14 @@ class FusedTransformerEncoderLayer(Layer):
linear1_weight_attr=weight_attrs[1], linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1], linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1], linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1]) linear2_bias_attr=bias_attrs[1],
)
def forward(self, src, src_mask=None, cache=None): def forward(self, src, src_mask=None, cache=None):
""" """
Applies a Transformer encoder layer on the input. Applies a Transformer encoder layer on the input.
Parameters: Parameters:
src (Tensor): The input of Transformer encoder layer. It is src (Tensor): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. a tensor with shape `[batch_size, sequence_length, d_model]`.
...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer):
`-INF` values and the others have 0 values. It can be None when `-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None. nothing wanted or needed to be prevented attention to. Default None.
cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
See `TransformerEncoderLayer.gen_cache` for more details. It is See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
only used for inference and should be None for training. Default only used for inference and should be None for training. Default
None. None.
Returns: Returns:
            Tensor|tuple, It is a tensor that has the same shape and data type \
                as `src`, representing the output of the Transformer encoder \
                layer. Or a tuple if `cache` is not None: besides the encoder \
                layer output, the tuple includes the new cache, which is the same \
                as the input `cache` argument but with an incremental length for \
                `incremental_cache`. See `MultiHeadAttention.gen_cache` and \
                `MultiHeadAttention.forward` for more details.
""" """
src_mask = _convert_attention_mask(src_mask, src.dtype) src_mask = _convert_attention_mask(src_mask, src.dtype)
if cache is None: if cache is None:
attn_out = self.fused_attn(src, attn_mask=src_mask) attn_out = self.fused_attn(src, attn_mask=src_mask)
else: else:
attn_out, incremental_cache = self.fused_attn(src, attn_out, incremental_cache = self.fused_attn(
attn_mask=src_mask, src, attn_mask=src_mask, cache=cache
cache=cache) )
ffn_out = self.ffn(attn_out) ffn_out = self.ffn(attn_out)
...@@ -889,7 +977,8 @@ class FusedTransformer(Layer): ...@@ -889,7 +977,8 @@ class FusedTransformer(Layer):
cross_attn_mask) # [2, 6, 128] cross_attn_mask) # [2, 6, 128]
""" """
def __init__(self, def __init__(
self,
d_model=512, d_model=512,
nhead=8, nhead=8,
num_encoder_layers=6, num_encoder_layers=6,
...@@ -903,7 +992,8 @@ class FusedTransformer(Layer): ...@@ -903,7 +992,8 @@ class FusedTransformer(Layer):
weight_attr=None, weight_attr=None,
bias_attr=None, bias_attr=None,
custom_encoder=None, custom_encoder=None,
custom_decoder=None): custom_decoder=None,
):
        super(FusedTransformer, self).__init__()
raise NotImplementedError() raise NotImplementedError()
...@@ -1071,7 +1161,8 @@ class FusedMultiTransformer(Layer): ...@@ -1071,7 +1161,8 @@ class FusedMultiTransformer(Layer):
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
self,
embed_dim, embed_dim,
num_heads, num_heads,
dim_feedforward, dim_feedforward,
...@@ -1095,16 +1186,24 @@ class FusedMultiTransformer(Layer): ...@@ -1095,16 +1186,24 @@ class FusedMultiTransformer(Layer):
nranks=1, nranks=1,
trans_qkvw=True, trans_qkvw=True,
ring_id=-1, ring_id=-1,
name=None): name=None,
):
super(FusedMultiTransformer, self).__init__() super(FusedMultiTransformer, self).__init__()
        assert embed_dim > 0, (
            "Expected embed_dim to be greater than 0, "
            "but received {}".format(embed_dim)
        )
        assert (
            num_heads > 0
        ), "Expected num_heads to be greater than 0, " "but received {}".format(
            num_heads
        )
        assert (
            dim_feedforward > 0
        ), "Expected dim_feedforward to be greater than 0, but received {}".format(
            dim_feedforward
        )
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer): ...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer): ...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer):
ln_scale = self.create_parameter( ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
qkv_weight = self.create_parameter( qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim] shape=[3, num_heads, self.head_dim, embed_dim]
if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], if trans_qkvw
else [embed_dim, 3, num_heads, self.head_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
qkv_bias = self.create_parameter( qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
linear_weight = self.create_parameter( linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
linear_bias = self.create_parameter(shape=[embed_dim], )
linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr, attr=linear_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
ffn_ln_scale = self.create_parameter( ffn_ln_scale = self.create_parameter(
shape=[embed_dim], shape=[embed_dim],
attr=ffn_ln_scale_attr, attr=ffn_ln_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
ffn_ln_bias = self.create_parameter(shape=[embed_dim], )
attr=ffn_ln_bias_attr, ffn_ln_bias = self.create_parameter(
is_bias=True) shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
)
ffn1_weight = self.create_parameter( ffn1_weight = self.create_parameter(
shape=[embed_dim, dim_feedforward], shape=[embed_dim, dim_feedforward],
attr=ffn1_weight_attr, attr=ffn1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn1_bias = self.create_parameter(shape=[dim_feedforward], )
ffn1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=ffn1_bias_attr, attr=ffn1_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
ffn2_weight = self.create_parameter( ffn2_weight = self.create_parameter(
shape=[dim_feedforward, embed_dim], shape=[dim_feedforward, embed_dim],
attr=ffn2_weight_attr, attr=ffn2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn2_bias = self.create_parameter(shape=[embed_dim], )
ffn2_bias = self.create_parameter(
shape=[embed_dim],
attr=ffn2_bias_attr, attr=ffn2_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer): ...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer):
mode='upscale_in_train', mode='upscale_in_train',
trans_qkvw=self._trans_qkvw, trans_qkvw=self._trans_qkvw,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
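
For reference, a hedged construction-plus-forward sketch that completes the truncated docstring example above; it assumes the `paddle.incubate.nn.FusedMultiTransformer` export, a GPU build, and the default weight/bias attribute arguments.

.. code-block:: python

    # required: gpu
    import paddle
    from paddle.incubate.nn import FusedMultiTransformer

    # one fused block: embed_dim=128, num_heads=2, dim_feedforward=512
    encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1)
    enc_input = paddle.rand((2, 4, 128))
    attn_mask = paddle.rand((2, 2, 4, 4))
    enc_output = encoder_layers(enc_input, attn_mask)   # [2, 4, 128]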
...@@ -20,14 +20,17 @@ from paddle.fluid import core ...@@ -20,14 +20,17 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row, def graph_khop_sampler(
row,
colptr, colptr,
input_nodes, input_nodes,
sample_sizes, sample_sizes,
sorted_eids=None, sorted_eids=None,
return_eids=False, return_eids=False,
name=None): name=None,
):
""" """
Graph Khop Sampler API. Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -50,24 +53,23 @@ def graph_khop_sampler(row, ...@@ -50,24 +53,23 @@ def graph_khop_sampler(row,
sample_sizes (list|tuple): The number of neighbors and number of layers we want sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape to sample. The data type should be int, and the shape
should only have one dimension. should only have one dimension.
        sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
                                        is True. The shape should be [num_edges, 1], and the data
                                        type should be the same as `row`. Default is None.
        return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
        name (str, optional): Name for the operation (optional, default is None).
                              For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        - edge_src (Tensor), The src index of the output edges, i.e. the first column of
          the edges. The shape is [num_sample_edges, 1] currently.
        - edge_dst (Tensor), The dst index of the output edges, i.e. the second column
          of the edges. The shape is [num_sample_edges, 1] currently.
        - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
        - reindex_nodes (Tensor), The reindex id of the input nodes.
        - edge_eids (Tensor), The id of the sample edges, returned only if `return_eids` is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -80,44 +82,72 @@ def graph_khop_sampler(row, ...@@ -80,44 +82,72 @@ def graph_khop_sampler(row,
colptr = paddle.to_tensor(colptr, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64") nodes = paddle.to_tensor(nodes, dtype="int64")
            edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, return_eids=False)
""" """
if _non_static_mode(): if _non_static_mode():
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None "
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ f"if return_eids is True."
_legacy_C_ops.graph_khop_sampler(row, sorted_eids, )
colptr, input_nodes, (
"sample_sizes", sample_sizes, edge_src,
"return_eids", True) edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
edge_src, edge_dst, sample_index, reindex_nodes, _ = \ (
_legacy_C_ops.graph_khop_sampler(row, None, edge_src,
colptr, input_nodes, edge_dst,
"sample_sizes", sample_sizes, sample_index,
"return_eids", False) reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None " f"if return_eids is True."
check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_khop_sampler")
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals()) helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
...@@ -125,24 +155,23 @@ def graph_khop_sampler(row, ...@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_khop_sampler", helper.append_op(
type="graph_khop_sampler",
inputs={ inputs={
"Row": row, "Row": row,
"Eids": sorted_eids, "Eids": sorted_eids,
"Col_Ptr": colptr, "Col_Ptr": colptr,
"X": input_nodes "X": input_nodes,
}, },
outputs={ outputs={
"Out_Src": edge_src, "Out_Src": edge_src,
"Out_Dst": edge_dst, "Out_Dst": edge_dst,
"Sample_Index": sample_index, "Sample_Index": sample_index,
"Reindex_X": reindex_nodes, "Reindex_X": reindex_nodes,
"Out_Eids": edge_eids "Out_Eids": edge_eids,
}, },
attrs={ attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
"sample_sizes": sample_sizes, )
"return_eids": return_eids
})
if return_eids: if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
......
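
When `return_eids` is True the op also returns the sampled edge ids; below is a hedged sketch of that call path. The graph tensors are illustrative, and the edge ids are simply 0..num_edges-1 assigned in CSC order to satisfy the `sorted_eids` requirement described above.

.. code-block:: python

    import paddle

    row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
    colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
    nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
    sorted_eids = paddle.arange(row.shape[0], dtype="int64")   # edge ids aligned with `row`
    sample_sizes = [2, 2]

    edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \
        paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes,
                                           sorted_eids=sorted_eids, return_eids=True)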
...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated import paddle.utils.deprecated as deprecated
@deprecated(since="2.4.0", @deprecated(
since="2.4.0",
update_to="paddle.geometric.reindex_graph", update_to="paddle.geometric.reindex_graph",
level=1, level=1,
reason="paddle.incubate.graph_reindex will be removed in future") reason="paddle.incubate.graph_reindex will be removed in future",
def graph_reindex(x, )
def graph_reindex(
x,
neighbors, neighbors,
count, count,
value_buffer=None, value_buffer=None,
index_buffer=None, index_buffer=None,
flag_buffer_hashtable=False, flag_buffer_hashtable=False,
name=None): name=None,
):
""" """
Graph Reindex API. Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -40,7 +45,7 @@ def graph_reindex(x, ...@@ -40,7 +45,7 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex. corresponding graph edges after reindex.
**Notes**: Notes:
The number in x should be unique, otherwise it would cause potential errors. The number in x should be unique, otherwise it would cause potential errors.
Besides, we also support multi-edge-types neighbors reindexing. If we have different Besides, we also support multi-edge-types neighbors reindexing. If we have different
edge_type neighbors for x, we should concatenate all the neighbors and count of x. edge_type neighbors for x, we should concatenate all the neighbors and count of x.
...@@ -58,24 +63,23 @@ def graph_reindex(x, ...@@ -58,24 +63,23 @@ def graph_reindex(x,
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently. Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_src (Tensor), The source node index of graph edges after reindex.
reindex_dst (Tensor): The destination node index of graph edges after reindex. - reindex_dst (Tensor), The destination node index of graph edges after reindex.
out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor where we put the input nodes `x` in the front, and put neighbor
nodes in the back. nodes in the back.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -109,47 +113,55 @@ def graph_reindex(x, ...@@ -109,47 +113,55 @@ def graph_reindex(x,
""" """
if flag_buffer_hashtable: if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None: if value_buffer is None or index_buffer is None:
raise ValueError(f"`value_buffer` and `index_buffer` should not" raise ValueError(
"be None if `flag_buffer_hashtable` is True.") f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode(): if _non_static_mode():
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
_legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, x,
"flag_buffer_hashtable", flag_buffer_hashtable) neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), check_variable_and_dtype(
"graph_reindex") neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable: if flag_buffer_hashtable:
check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), check_variable_and_dtype(
"graph_reindex") value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), )
"graph_reindex") check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals()) helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="graph_reindex", helper.append_op(
type="graph_reindex",
inputs={ inputs={
"X": "X": x,
x, "Neighbors": neighbors,
"Neighbors": "Count": count,
neighbors, "HashTable_Value": value_buffer if flag_buffer_hashtable else None,
"Count": "HashTable_Index": index_buffer if flag_buffer_hashtable else None,
count,
"HashTable_Value":
value_buffer if flag_buffer_hashtable else None,
"HashTable_Index":
index_buffer if flag_buffer_hashtable else None,
}, },
outputs={ outputs={
"Reindex_Src": reindex_src, "Reindex_Src": reindex_src,
"Reindex_Dst": reindex_dst, "Reindex_Dst": reindex_dst,
"Out_Nodes": out_nodes "Out_Nodes": out_nodes,
}, },
attrs={"flag_buffer_hashtable": flag_buffer_hashtable}) attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
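
A small worked sketch of the reindex semantics described above; the input values are illustrative, and the outputs in the comments follow the rule that `out_nodes` is `x` followed by the previously unseen neighbors in order of first appearance.

.. code-block:: python

    import paddle

    x = paddle.to_tensor([0, 5, 8, 9], dtype="int64")
    neighbors = paddle.to_tensor([8, 9, 0, 4, 7, 6, 7], dtype="int64")
    count = paddle.to_tensor([2, 2, 1, 2], dtype="int32")

    reindex_src, reindex_dst, out_nodes = \
        paddle.incubate.graph_reindex(x, neighbors, count)
    # out_nodes:   [0, 5, 8, 9, 4, 7, 6]
    # reindex_src: [2, 3, 0, 4, 5, 6, 5]   (neighbors mapped into out_nodes)
    # reindex_dst: [0, 0, 1, 1, 2, 3, 3]   (each x[i] repeated count[i] times)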
...@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated ...@@ -25,8 +25,10 @@ import paddle.utils.deprecated as deprecated
since="2.4.0", since="2.4.0",
update_to="paddle.geometric.sample_neighbors", update_to="paddle.geometric.sample_neighbors",
level=1, level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future") reason="paddle.incubate.graph_sample_neighbors will be removed in future",
def graph_sample_neighbors(row, )
def graph_sample_neighbors(
row,
colptr, colptr,
input_nodes, input_nodes,
eids=None, eids=None,
...@@ -34,8 +36,10 @@ def graph_sample_neighbors(row, ...@@ -34,8 +36,10 @@ def graph_sample_neighbors(row,
sample_size=-1, sample_size=-1,
return_eids=False, return_eids=False,
flag_perm_buffer=False, flag_perm_buffer=False,
name=None): name=None,
):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -71,14 +75,13 @@ def graph_sample_neighbors(row, ...@@ -71,14 +75,13 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
        - out_neighbors (Tensor), The sampled neighbors of the input nodes.
        - out_count (Tensor), The number of sampled neighbors of each input node; the shape should be the same as `input_nodes`.
        - out_eids (Tensor), If `return_eids` is True, the eid information of the sampled edges will be returned.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
...@@ -98,59 +101,83 @@ def graph_sample_neighbors(row, ...@@ -98,59 +101,83 @@ def graph_sample_neighbors(row,
if return_eids: if return_eids:
if eids is None: if eids is None:
raise ValueError( raise ValueError(
f"`eids` should not be None if `return_eids` is True.") f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer: if flag_perm_buffer:
if perm_buffer is None: if perm_buffer is None:
raise ValueError( raise ValueError(
f"`perm_buffer` should not be None if `flag_perm_buffer`" f"`perm_buffer` should not be None if `flag_perm_buffer`"
"is True.") "is True."
)
if _non_static_mode(): if _non_static_mode():
out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( (
row, colptr, input_nodes, eids, perm_buffer, "sample_size", out_neighbors,
sample_size, "return_eids", return_eids, "flag_perm_buffer", out_count,
flag_perm_buffer) out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") row, "Row", ("int32", "int64"), "graph_sample_neighbors"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_sample_neighbors") check_variable_and_dtype(
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
"graph_sample_neighbors") )
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids: if return_eids:
check_variable_and_dtype(eids, "Eids", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer: if flag_perm_buffer:
check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals()) helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_sample_neighbors", helper.append_op(
type="graph_sample_neighbors",
inputs={ inputs={
"Row": row, "Row": row,
"Col_Ptr": colptr, "Col_Ptr": colptr,
"X": input_nodes, "X": input_nodes,
"Eids": eids if return_eids else None, "Eids": eids if return_eids else None,
"Perm_Buffer": "Perm_Buffer": perm_buffer if flag_perm_buffer else None,
perm_buffer if flag_perm_buffer else None
}, },
outputs={ outputs={
"Out": out_neighbors, "Out": out_neighbors,
"Out_Count": out_count, "Out_Count": out_count,
"Out_Eids": out_eids "Out_Eids": out_eids,
}, },
attrs={ attrs={
"sample_size": sample_size, "sample_size": sample_size,
"return_eids": return_eids, "return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer "flag_perm_buffer": flag_perm_buffer,
}) },
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
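
A minimal sketch of the default sampling path (no eids, no permutation buffer), built from the CSC graph spelled out in the docstring's edge list above; `sample_size=3` caps the number of neighbors kept per input node, and the chosen input nodes are illustrative.

.. code-block:: python

    import paddle

    # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
    #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
    row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
    colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
    nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")

    out_neighbors, out_count = \
        paddle.incubate.graph_sample_neighbors(row, colptr, nodes, sample_size=3)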
...@@ -36,7 +36,8 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -36,7 +36,8 @@ from paddle import _C_ops, _legacy_C_ops
__all__ = ['resnet_basic_block', 'ResNetBasicBlock'] __all__ = ['resnet_basic_block', 'ResNetBasicBlock']
def resnet_basic_block(x, def resnet_basic_block(
x,
filter1, filter1,
scale1, scale1,
bias1, bias1,
...@@ -69,73 +70,198 @@ def resnet_basic_block(x, ...@@ -69,73 +70,198 @@ def resnet_basic_block(x,
use_global_stats=None, use_global_stats=None,
training=False, training=False,
trainable_statistics=False, trainable_statistics=False,
find_conv_max=True): find_conv_max=True,
):
if fluid.framework.in_dygraph_mode(): if fluid.framework.in_dygraph_mode():
attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, attrs = (
'padding1', padding1, 'padding2', padding2, 'padding3', 'stride1',
padding3, 'dilation1', dilation1, 'dilation2', dilation2, stride1,
'dilation3', dilation3, 'group', groups, 'momentum', momentum, 'stride2',
'epsilon', eps, 'data_format', data_format, 'has_shortcut', stride2,
has_shortcut, 'use_global_stats', use_global_stats, 'stride3',
"trainable_statistics", trainable_statistics, 'is_test', stride3,
not training, 'act_type', "relu", 'find_conv_input_max', 'padding1',
find_conv_max) padding1,
'padding2',
out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ padding2,
getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ 'padding3',
filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) padding3,
'dilation1',
dilation1,
'dilation2',
dilation2,
'dilation3',
dilation3,
'group',
groups,
'momentum',
momentum,
'epsilon',
eps,
'data_format',
data_format,
'has_shortcut',
has_shortcut,
'use_global_stats',
use_global_stats,
"trainable_statistics",
trainable_statistics,
'is_test',
not training,
'act_type',
"relu",
'find_conv_input_max',
find_conv_max,
)
(
out,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
) = getattr(_C_ops, "resnet_basic_block")(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
mean1,
var1,
mean2,
var2,
mean3,
var3,
*attrs
)
return out return out
helper = LayerHelper('resnet_basic_block', **locals()) helper = LayerHelper('resnet_basic_block', **locals())
bn_param_dtype = fluid.core.VarDesc.VarType.FP32 bn_param_dtype = fluid.core.VarDesc.VarType.FP32
max_dtype = fluid.core.VarDesc.VarType.FP32 max_dtype = fluid.core.VarDesc.VarType.FP32
out = helper.create_variable_for_type_inference(dtype=x.dtype, out = helper.create_variable_for_type_inference(
stop_gradient=True) dtype=x.dtype, stop_gradient=True
conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) conv1 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean1 = helper.create_variable_for_type_inference( saved_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd1 = helper.create_variable_for_type_inference( saved_invstd1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean1 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 running_mean1 = (
running_var1 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 dtype=bn_param_dtype, stop_gradient=True
conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean1 is None
conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, else mean1
stop_gradient=True) )
running_var1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var1 is None
else var1
)
conv2 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv2_input = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean2 = helper.create_variable_for_type_inference( saved_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd2 = helper.create_variable_for_type_inference( saved_invstd2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean2 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 running_mean2 = (
running_var2 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 dtype=bn_param_dtype, stop_gradient=True
conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean2 is None
else mean2
)
running_var2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var2 is None
else var2
)
conv3 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean3 = helper.create_variable_for_type_inference( saved_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd3 = helper.create_variable_for_type_inference( saved_invstd3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean3 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 running_mean3 = (
running_var3 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 dtype=bn_param_dtype, stop_gradient=True
)
if mean3 is None
else mean3
)
running_var3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var3 is None
else var3
)
conv1_input_max = helper.create_variable_for_type_inference( conv1_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv1_filter_max = helper.create_variable_for_type_inference( conv1_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_input_max = helper.create_variable_for_type_inference( conv2_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_filter_max = helper.create_variable_for_type_inference( conv2_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_input_max = helper.create_variable_for_type_inference( conv3_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_filter_max = helper.create_variable_for_type_inference( conv3_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
inputs = { inputs = {
'X': x, 'X': x,
...@@ -175,7 +301,7 @@ def resnet_basic_block(x, ...@@ -175,7 +301,7 @@ def resnet_basic_block(x,
"trainable_statistics": trainable_statistics, "trainable_statistics": trainable_statistics,
'is_test': not training, 'is_test': not training,
'act_type': "relu", 'act_type': "relu",
'find_conv_input_max': find_conv_max 'find_conv_input_max': find_conv_max,
} }
outputs = { outputs = {
...@@ -203,39 +329,120 @@ def resnet_basic_block(x, ...@@ -203,39 +329,120 @@ def resnet_basic_block(x,
'MaxInput3': conv3_input_max, 'MaxInput3': conv3_input_max,
'MaxFilter3': conv3_filter_max, 'MaxFilter3': conv3_filter_max,
} }
helper.append_op(type='resnet_basic_block', helper.append_op(
inputs=inputs, type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
class ResNetBasicBlock(Layer): class ResNetBasicBlock(Layer):
""" r"""
    ResNetBasicBlock is designed to optimize the performance of the basic unit of the SSD ResNet block.
    If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one pass.
    If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one pass. In this
    case the shape of the output is the same as that of the input.

    Args:
        num_channels (int): The number of input image channels.
        num_filter (int): The number of filters. It is the same as the number of output image channels.
        filter_size (int|list|tuple): The filter size. If filter_size
            is a tuple, it must contain two integers, (filter_size_height,
            filter_size_width). Otherwise, filter_size_height = filter_size_width = filter_size.
        stride (int, optional): The stride size. It means the stride in convolution.
            If stride is a tuple, it must contain two integers, (stride_height, stride_width).
            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
        act (str, optional): Activation type, if it is set to None, activation is not appended.
            Default: None
        momentum (float, optional): The value used for the moving_mean and
            moving_var computation. This should be a float number or a Tensor with
            shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
eps (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
data_format (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. Currently only `"NCHW"` is supported, where the data is stored in
the order of: `[batch_size, input_channels, input_height, input_width]`.
has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
use_global_stats (bool, optional): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period. Default: False.
is_test (bool, optional): A flag indicating whether it is in
            test phase or not. Default: False.
filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. Default: None.
scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. Default: None.
moving_var_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. Default: None.
        padding (int, optional): The padding size. It only supports padding_height = padding_width = padding.
Default: padding = 0.
dilation (int, optional): The dilation size. It means the spacing between the kernel
            points. It only supports dilation_height = dilation_width = dilation.
Default: dilation = 1.
trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
Default: False.
find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True.
Returns:
        A Tensor representing the output of the ResNetBasicBlock, whose data type is the same as the input.
Examples:
.. code-block:: python
# required: xpu
import paddle
from paddle.incubate.xpu.resnet_block import ResNetBasicBlock
ch_in = 4
ch_out = 8
x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.)
resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in,
num_filter1=ch_out,
filter1_size=3,
num_channels2=ch_out,
num_filter2=ch_out,
filter2_size=3,
num_channels3=ch_in,
num_filter3=ch_out,
filter3_size=1,
stride1=1,
stride2=1,
stride3=1,
act='relu',
padding1=1,
padding2=1,
padding3=0,
has_shortcut=True)
out = resnet_basic_block.forward(x)
print(out.shape) # [2, 8, 16, 16]
""" """
def __init__(self, def __init__(
self,
num_channels1, num_channels1,
num_filter1, num_filter1,
filter1_size, filter1_size,
...@@ -277,14 +484,17 @@ class ResNetBasicBlock(Layer): ...@@ -277,14 +484,17 @@ class ResNetBasicBlock(Layer):
dilation2=1, dilation2=1,
dilation3=1, dilation3=1,
trainable_statistics=False, trainable_statistics=False,
find_conv_max=True): find_conv_max=True,
):
super(ResNetBasicBlock, self).__init__() super(ResNetBasicBlock, self).__init__()
self._stride1 = stride1 self._stride1 = stride1
self._stride2 = stride2 self._stride2 = stride2
self._kernel1_size = utils.convert_to_list(filter1_size, 2, self._kernel1_size = utils.convert_to_list(
'filter1_size') filter1_size, 2, 'filter1_size'
self._kernel2_size = utils.convert_to_list(filter2_size, 2, )
'filter2_size') self._kernel2_size = utils.convert_to_list(
filter2_size, 2, 'filter2_size'
)
self._dilation1 = dilation1 self._dilation1 = dilation1
self._dilation2 = dilation2 self._dilation2 = dilation2
self._padding1 = padding1 self._padding1 = padding1
...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer): ...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer):
self._find_conv_max = find_conv_max self._find_conv_max = find_conv_max
if has_shortcut: if has_shortcut:
self._kernel3_size = utils.convert_to_list(filter3_size, 2, self._kernel3_size = utils.convert_to_list(
'filter3_size') filter3_size, 2, 'filter3_size'
)
self._padding3 = padding3 self._padding3 = padding3
self._stride3 = stride3 self._stride3 = stride3
self._dilation3 = dilation3 self._dilation3 = dilation3
...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer): ...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer):
if data_format not in valid_format: if data_format not in valid_format:
raise ValueError( raise ValueError(
"conv_format must be one of {}, but got conv_format={}".format( "conv_format must be one of {}, but got conv_format={}".format(
valid_format, data_format)) valid_format, data_format
)
)
def _get_default_param_initializer(channels, kernel_size): def _get_default_param_initializer(channels, kernel_size):
filter_elem_num = np.prod(kernel_size) * channels filter_elem_num = np.prod(kernel_size) * channels
std = (2.0 / filter_elem_num)**0.5 std = (2.0 / filter_elem_num) ** 0.5
return I.Normal(0.0, std) return I.Normal(0.0, std)
# init filter # init filter
...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer): ...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer):
shape=filter1_shape, shape=filter1_shape,
attr=filter1_attr, attr=filter1_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels1, self._kernel1_size)) num_channels1, self._kernel1_size
),
)
self.scale_1 = self.create_parameter( self.scale_1 = self.create_parameter(
shape=bn1_param_shape, shape=bn1_param_shape,
attr=scale1_attr, attr=scale1_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_1 = self.create_parameter(shape=bn1_param_shape, )
self.bias_1 = self.create_parameter(
shape=bn1_param_shape,
attr=bias1_attr, attr=bias1_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
is_bias=True) is_bias=True,
self.mean_1 = self.create_parameter(attr=ParamAttr( )
self.mean_1 = self.create_parameter(
attr=ParamAttr(
name=moving_mean1_name, name=moving_mean1_name,
initializer=I.Constant(0.0), initializer=I.Constant(0.0),
trainable=False), trainable=False,
),
shape=bn1_param_shape, shape=bn1_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.mean_1.stop_gradient = True self.mean_1.stop_gradient = True
self.var_1 = self.create_parameter( self.var_1 = self.create_parameter(
attr=ParamAttr(name=moving_var1_name, attr=ParamAttr(
name=moving_var1_name,
initializer=I.Constant(1.0), initializer=I.Constant(1.0),
trainable=False), trainable=False,
),
shape=bn1_param_shape, shape=bn1_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_1.stop_gradient = True self.var_1.stop_gradient = True
self.filter_2 = self.create_parameter( self.filter_2 = self.create_parameter(
shape=filter2_shape, shape=filter2_shape,
attr=filter2_attr, attr=filter2_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels2, self._kernel2_size)) num_channels2, self._kernel2_size
),
)
self.scale_2 = self.create_parameter( self.scale_2 = self.create_parameter(
shape=bn2_param_shape, shape=bn2_param_shape,
attr=scale2_attr, attr=scale2_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_2 = self.create_parameter(shape=bn2_param_shape, )
self.bias_2 = self.create_parameter(
shape=bn2_param_shape,
attr=bias2_attr, attr=bias2_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
is_bias=True) is_bias=True,
self.mean_2 = self.create_parameter(attr=ParamAttr( )
self.mean_2 = self.create_parameter(
attr=ParamAttr(
name=moving_mean2_name, name=moving_mean2_name,
initializer=I.Constant(0.0), initializer=I.Constant(0.0),
trainable=False), trainable=False,
),
shape=bn2_param_shape, shape=bn2_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.mean_2.stop_gradient = True self.mean_2.stop_gradient = True
self.var_2 = self.create_parameter( self.var_2 = self.create_parameter(
attr=ParamAttr(name=moving_var2_name, attr=ParamAttr(
name=moving_var2_name,
initializer=I.Constant(1.0), initializer=I.Constant(1.0),
trainable=False), trainable=False,
),
shape=bn2_param_shape, shape=bn2_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_2.stop_gradient = True self.var_2.stop_gradient = True
if has_shortcut: if has_shortcut:
bn3_param_shape = [1, 1, num_filter3] bn3_param_shape = [1, 1, num_filter3]
filter3_shape = [ filter3_shape = [
num_filter3, num_channels3, filter3_size, filter3_size num_filter3,
num_channels3,
filter3_size,
filter3_size,
] ]
self.filter_3 = self.create_parameter( self.filter_3 = self.create_parameter(
shape=filter3_shape, shape=filter3_shape,
attr=filter3_attr, attr=filter3_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels3, self._kernel3_size)) num_channels3, self._kernel3_size
),
)
self.scale_3 = self.create_parameter( self.scale_3 = self.create_parameter(
shape=bn3_param_shape, shape=bn3_param_shape,
attr=scale3_attr, attr=scale3_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_3 = self.create_parameter(shape=bn3_param_shape, )
self.bias_3 = self.create_parameter(
shape=bn3_param_shape,
attr=bias3_attr, attr=bias3_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
is_bias=True) is_bias=True,
self.mean_3 = self.create_parameter(attr=ParamAttr( )
self.mean_3 = self.create_parameter(
attr=ParamAttr(
name=moving_mean3_name, name=moving_mean3_name,
initializer=I.Constant(0.0), initializer=I.Constant(0.0),
trainable=False), trainable=False,
),
shape=bn3_param_shape, shape=bn3_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.mean_3.stop_gradient = True self.mean_3.stop_gradient = True
self.var_3 = self.create_parameter(attr=ParamAttr( self.var_3 = self.create_parameter(
attr=ParamAttr(
name=moving_var3_name, name=moving_var3_name,
initializer=I.Constant(1.0), initializer=I.Constant(1.0),
trainable=False), trainable=False,
),
shape=bn3_param_shape, shape=bn3_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_3.stop_gradient = True self.var_3.stop_gradient = True
else: else:
self.filter_3 = None self.filter_3 = None
...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer): ...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer):
use_global_stats=self._use_global_stats, use_global_stats=self._use_global_stats,
training=self.training, training=self.training,
trainable_statistics=self._trainable_statistics, trainable_statistics=self._trainable_statistics,
find_conv_max=self._find_conv_max) find_conv_max=self._find_conv_max,
)
return out return out
@@ -715,6 +715,7 @@ def upsample(
    name=None,
):
    """

    This API resizes a batch of images.

    The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
@@ -725,11 +726,12 @@ def upsample(
    and the resizing only applies on the three dimensions (depth, height and width).

    Supporting resample methods:

        - 'linear' : Linear interpolation
        - 'bilinear' : Bilinear interpolation
        - 'trilinear' : Trilinear interpolation
        - 'nearest' : Nearest neighbor interpolation
        - 'bicubic' : Bicubic interpolation

    Linear interpolation is the method of using a line connecting two known quantities
    to determine the value of an unknown quantity between the two known quantities.
@@ -831,8 +833,9 @@ def upsample(
        D_out = D_{in} * scale_{factor}
        H_out = H_{in} * scale_{factor}
        W_out = W_{in} * scale_{factor}

    For details of linear interpolation, please refer to Wikipedia:
    https://en.wikipedia.org/wiki/Linear_interpolation.

    For details of nearest neighbor interpolation, please refer to Wikipedia:
    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
@@ -876,6 +879,7 @@ def upsample(
        name(str, optional): The default value is None.
            Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`

    Returns:
        A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
        A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
...
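As a hedged illustration of the resize modes and the ``scale_factor`` relation documented above (not part of the diff; it assumes the released paddle.nn.functional.upsample API):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # NCHW input: one image, 3 channels, 4 x 4 spatial size.
    x = paddle.rand([1, 3, 4, 4])

    # H_out = H_in * scale_factor, W_out = W_in * scale_factor.
    y = F.upsample(x, scale_factor=2.0, mode='bilinear')
    print(y.shape)  # [1, 3, 8, 8]

    # An explicit output size can be passed instead of a scale factor.
    z = F.upsample(x, size=[6, 6], mode='nearest')
    print(z.shape)  # [1, 3, 6, 6]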
@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
    r"""

    It computes the pairwise distance between two vectors. The
    distance is calculated by p-order norm:
@@ -48,6 +49,7 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
    Returns:
        Tensor, the dtype is same as input tensor.

        - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
          depending on whether the input has data shaped as :math:`[N, D]`.
        - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
...
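A short, hedged sketch of the p-norm distance described above (illustrative only; it uses the paddle.nn.functional.pairwise_distance API documented in this hunk):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([[1.0, 3.0], [3.0, 5.0]])
    y = paddle.to_tensor([[5.0, 6.0], [7.0, 8.0]])

    # p=2 gives the per-row Euclidean distance; keepdim=False -> shape [N].
    dist = F.pairwise_distance(x, y, p=2.0)
    print(dist)            # approximately [5.0, 5.0]

    # keepdim=True keeps the reduced axis -> shape [N, 1].
    dist_kd = F.pairwise_distance(x, y, p=2.0, keepdim=True)
    print(dist_kd.shape)   # [2, 1]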
@@ -1310,6 +1310,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None):
    r"""

    Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.

    If `reduction` set to ``'none'``, the loss is:
@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None):
    Returns:
        Tensor, the L1 Loss of Tensor ``input`` and ``label``.
        If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
        If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].

    Examples:
@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None):
            l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
            print(l1_loss.numpy())
            # [1.4]

    """
    if reduction not in ['sum', 'mean', 'none']:
        raise ValueError(
@@ -2286,6 +2288,7 @@ def cross_entropy(
    name=None,
):
    r"""

    By default, this operator implements the cross entropy loss function with softmax. This function
    combines the calculation of the softmax operation and the cross entropy loss function
    to provide a more numerically stable computing.
@@ -2399,21 +2402,13 @@ def cross_entropy(
    Parameters:
        input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .

            Note:
                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
                2. when use_softmax=False, it expects the output of softmax operator.

        label (Tensor):
            1. If soft_label=False, the shape is
            :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
            the data type is int32, int64, float32, float64, where each value is [0, C-1].
@@ -2421,48 +2416,27 @@ def cross_entropy(
            2. If soft_label=True, the shape and data type should be same with ``input`` ,
            and the sum of the labels for each sample should be 1.
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .
        ignore_index (int64, optional): Specifies a target value that is ignored
            and does not contribute to the loss. A negative value means that no label
            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .
        reduction (str, optional): Indicate how to average the loss by batch_size,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
            Default is ``'mean'``.
        soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
        axis (int, optional): The index of dimension to perform softmax calculations.
            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
            number of dimensions of input :attr:`input`.
            Default is ``-1`` .
        use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
            Default is ``True``.
        name (str, optional): The name of the operator. Default is ``None`` .
            For more information, please refer to :ref:`api_guide_Name` .

    Returns:
@@ -2478,9 +2452,7 @@ def cross_entropy(
        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .

    Examples:
        .. code-block:: python

            # hard labels
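            # Illustrative continuation, not part of the original diff: a minimal
            # hard-label call (it assumes the public paddle.nn.functional.cross_entropy API).
            import paddle

            N, C = 4, 5                                   # batch size, number of classes
            logits = paddle.rand([N, C])                  # unscaled logits (use_softmax=True by default)
            labels = paddle.randint(0, C, [N], dtype='int64')
            loss = paddle.nn.functional.cross_entropy(logits, labels)   # reduction='mean'
            print(loss)                                   # a single scalar loss value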
@@ -3834,6 +3806,7 @@ def triplet_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None):
    """

    The API measures the soft margin loss between input predictions ``input``
    and target labels ``label`` . It can be described as:
@@ -3842,7 +3815,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
    Parameters:
        input (Tensor): The input predictions tensor with shape: ``[N, *]``,
            N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
            Available dtype is float32, float64.
@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
    Returns:

        Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].

    Examples:
        .. code-block:: python
@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
            # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
            # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
            # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
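            # Illustrative continuation, not part of the original diff (it assumes the
            # `input` and `label` tensors defined earlier in this example): the default
            # 'mean' reduction collapses the element-wise losses above into one scalar.
            mean_loss = paddle.nn.functional.soft_margin_loss(input, label)
            print(mean_loss.shape)  # [1]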
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
......
@@ -1735,16 +1735,18 @@ def adaptive_avg_pool1d(x, output_size, name=None):
def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
    r"""

    Applies 2D adaptive avg pooling on input tensor. The h and w dimensions
    of the output tensor are determined by the parameter output_size.

    For avg adaptive pool2d:

    .. math::

        hstart &= floor(i * H_{in} / H_{out}) \\
        hend &= ceil((i + 1) * H_{in} / H_{out}) \\
        wstart &= floor(j * W_{in} / W_{out}) \\
        wend &= ceil((j + 1) * W_{in} / W_{out}) \\
        Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)}

    Args:
@@ -1753,14 +1755,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two elements, (H, W). H and W can be either an int, or None which means
            the size will be the same as that of the input.
        data_format (str, optional): The data format of the input and output data. An optional string
            from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
            the order of: [batch_size, input_channels, input_height, input_width].
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.

    Returns:
        Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor.

    Examples:
        .. code-block:: python
@@ -1788,6 +1791,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
                x = x,
                output_size=[3, 3])
            # out.shape is [2, 3, 3, 3]
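            # Illustrative continuation, not part of the original diff: check one output
            # cell of `out` against the hstart/hend/wstart/wend formula given above.
            import math
            i, j = 0, 0
            H_in, W_in = x.shape[2], x.shape[3]
            H_out, W_out = out.shape[2], out.shape[3]
            hstart, hend = math.floor(i * H_in / H_out), math.ceil((i + 1) * H_in / H_out)
            wstart, wend = math.floor(j * W_in / W_out), math.ceil((j + 1) * W_in / W_out)
            manual = x[:, :, hstart:hend, wstart:wend].mean(axis=[2, 3])
            # `manual` matches out[:, :, i, j] up to floating point error.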
""" """
if not in_dynamic_mode(): if not in_dynamic_mode():
check_variable_and_dtype( check_variable_and_dtype(
...@@ -1879,35 +1883,37 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1879,35 +1883,37 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
""" r"""
This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size. of the output tensor are determined by the parameter output_size.
For avg adaptive pool3d: For avg adaptive pool3d:
.. math:: .. math::
dstart &= floor(i * D_{in} / D_{out}) dstart &= floor(i * D_{in} / D_{out}) \\
dend &= ceil((i + 1) * D_{in} / D_{out}) dend &= ceil((i + 1) * D_{in} / D_{out}) \\
hstart &= floor(j * H_{in} / H_{out}) hstart &= floor(j * H_{in} / H_{out}) \\
hend &= ceil((j + 1) * H_{in} / H_{out}) hend &= ceil((j + 1) * H_{in} / H_{out}) \\
wstart &= floor(k * W_{in} / W_{out}) wstart &= floor(k * W_{in} / W_{out}) \\
wend &= ceil((k + 1) * W_{in} / W_{out}) wend &= ceil((k + 1) * W_{in} / W_{out}) \\
Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]}
{(dend - dstart) * (hend - hstart) * (wend - wstart)} {(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args: Args:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
The data type can be float32, float64. The data type can be float32, float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or
it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means list, it must contain three elements, (D, H, W). D, H and W can be either a int,
the size will be the same as that of the input. or None which means the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string data_format (str, optional): The data format of the input and output data. An optional string
from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
the order of: [batch_size, input_channels, input_depth, input_height, input_width]. the order of: [batch_size, input_channels, input_depth, input_height, input_width].
name(str, optional): For detailed information, please refer name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
to :ref:`api_guide_Name`. Usually name is no need to set and Usually name is no need to set and None by default.
None by default.
Returns: Returns:
Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1937,6 +1943,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): ...@@ -1937,6 +1943,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
x = input_data, x = input_data,
output_size=[3, 3, 3]) output_size=[3, 3, 3])
# out.shape is [2, 3, 3, 3, 3] # out.shape is [2, 3, 3, 3, 3]
""" """
if not in_dynamic_mode(): if not in_dynamic_mode():
check_variable_and_dtype( check_variable_and_dtype(
......
@@ -1450,15 +1450,16 @@ class Maxout(Layer):
class Softmax2D(Layer):
    r"""

    Softmax2D Activation.
    Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j).
    The sum of result in each location (C, H_i, W_j) will be one.

    Shape:
        - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)`
        - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input)

    Returns:
        A Tensor of the same shape and dtype as input with value in range [0, 1].

    Examples:
@@ -1483,6 +1484,7 @@ class Softmax2D(Layer):
            # [[0.42368975 0.51082766 0.47752273 0.5258871 ]
            # [0.66754097 0.47182566 0.5187628 0.5402329 ]
            # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]]
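            # Illustrative continuation, not part of the original diff (it assumes `out`
            # holds the result printed above): softmax is applied across the channel
            # axis, so the values at every (h, w) location sum to one.
            channel_sum = out.sum(axis=1)
            print(paddle.allclose(channel_sum, paddle.ones_like(channel_sum)))  # True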
""" """
def __init__(self, name=None): def __init__(self, name=None):
......
...@@ -20,6 +20,7 @@ __all__ = [] ...@@ -20,6 +20,7 @@ __all__ = []
class PairwiseDistance(Layer): class PairwiseDistance(Layer):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-oreder norm: distance is calculated by p-oreder norm:
...@@ -38,10 +39,10 @@ class PairwiseDistance(Layer): ...@@ -38,10 +39,10 @@ class PairwiseDistance(Layer):
Generally, no setting is required. Default: None. Generally, no setting is required. Default: None.
Shape: Shape:
x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
is the dimension of the data. Available data type is float32, float64. is the dimension of the data. Available data type is float32, float64.
y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
output: The same dtype as input tensor. - output: The same dtype as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
......
@@ -31,7 +31,8 @@ __all__ = []
class BCEWithLogitsLoss(Layer):
    r"""

    This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer.
    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
    layer and some reduce operations.
@@ -86,21 +87,21 @@ class BCEWithLogitsLoss(Layer):
            For more information, please refer to :ref:`api_guide_Name`.

    Shapes:
        - logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, `*`],
          N is batch_size, `*` means number of additional dimensions. The ``logit``
          is usually the output of Linear layer. Available dtype is float32, float64.
        - label (Tensor): The target labels tensor. 2-D tensor with the same shape as
          ``logit``. The target labels which values should be numbers between 0 and 1.
          Available dtype is float32, float64.
        - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``logit`` , else the shape of output is scalar.

    Returns:
        A callable object of BCEWithLogitsLoss.

    Examples:
        .. code-block:: python

            import paddle

            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
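            # Illustrative continuation, not part of the original diff: the layer applies
            # sigmoid to `logit` internally and then the binary cross entropy loss.
            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()   # reduction='mean' by default
            output = bce_logit_loss(logit, label)
            print(output)                                    # a scalar loss tensor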
@@ -139,6 +140,7 @@ class BCEWithLogitsLoss(Layer):
class CrossEntropyLoss(Layer):
    r"""

    By default, this operator implements the cross entropy loss function with softmax. This function
    combines the calculation of the softmax operation and the cross entropy loss function
    to provide a more numerically stable computing.
@@ -251,60 +253,35 @@ class CrossEntropyLoss(Layer):
    Parameters:
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .
        ignore_index (int64, optional): Specifies a target value that is ignored
            and does not contribute to the loss. A negative value means that no label
            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .
        reduction (str, optional): Indicate how to average the loss by batch_size,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
            Default is ``'mean'``.
        soft_label (bool, optional): Indicate whether label is soft.
            If soft_label=False, the label is hard. If soft_label=True, the label is soft.
            Default is ``False``.
        axis (int, optional): The index of dimension to perform softmax calculations.
            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
            of dimensions of input :attr:`input`.
            Default is ``-1`` .
        use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
            Default is ``True``.
        name (str, optional): The name of the operator. Default is ``None`` .
            For more information, please refer to :ref:`api_guide_Name` .

    Shape:
        - **input** (Tensor), the data type is float32, float64. Shape is
          :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .

          Note:
              1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
@@ -312,7 +289,6 @@ class CrossEntropyLoss(Layer):
              2. when use_softmax=False, it expects the output of softmax operator.

        - **label** (Tensor)
          1. If soft_label=False, the shape is
@@ -322,14 +298,9 @@ class CrossEntropyLoss(Layer):
          2. If soft_label=True, the shape and data type should be same with ``input`` ,
          and the sum of the labels for each sample should be 1.
        - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``.
          The data type is the same as input.
          If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
          If :attr:`reduction` is ``'none'``:
          1. If soft_label = False, the dimension of return value is the same with ``label`` .
@@ -634,6 +605,7 @@ class MSELoss(Layer):
class L1Loss(Layer):
    r"""

    Construct a callable object of the ``L1Loss`` class.
    The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
@@ -663,10 +635,10 @@ class L1Loss(Layer):
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - input (Tensor): The input tensor. The shape is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64.
        - label (Tensor): label. The shape is ``[N, *]``, same shape as ``input`` . Its data type should be float32, float64, int32, int64.
        - output (Tensor): The L1 Loss of ``input`` and ``label``.
          If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
          If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].

    Examples:
@@ -692,6 +664,7 @@ class L1Loss(Layer):
            print(output)
            # [[0.20000005 0.19999999]
            # [0.2 0.79999995]]

    """

    def __init__(self, reduction='mean', name=None):
@@ -712,6 +685,7 @@ class L1Loss(Layer):
class BCELoss(Layer):
    """

    This interface is used to construct a callable object of the ``BCELoss`` class.
    The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
    and target labels ``label`` . The binary_cross_entropy loss can be described as:
@@ -755,13 +729,13 @@ class BCELoss(Layer):
            For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means
          number of additional dimensions. The input ``input`` should always
          be the output of sigmoid. Available dtype is float32, float64.
        - label (Tensor): 2-D tensor with the same shape as ``input``. The target
          labels which values should be numbers between 0 and 1. Available
          dtype is float32, float64.
        - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
          same as ``input`` , else the shape of output is scalar.

    Returns:
@@ -914,6 +888,7 @@ class NLLLoss(Layer):
class KLDivLoss(Layer):
    r"""

    Generate a callable object of 'KLDivLoss' to calculate the
    Kullback-Leibler divergence loss between Input(X) and
    Input(Target). Note that Input(X) is the log-probability
@@ -933,14 +908,10 @@ class KLDivLoss(Layer):
            Default is ``'mean'``.

    Shape:
        - input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions.
        - label (Tensor): ``(N, *)``, same shape as input.
        - output (Tensor): tensor with shape: [1] by default.

    Examples:
        .. code-block:: python
@@ -970,6 +941,7 @@ class KLDivLoss(Layer):
            kldiv_criterion = nn.KLDivLoss(reduction='none')
            pred_loss = kldiv_criterion(x, target)
            # shape=[5, 20]
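            # Illustrative continuation, not part of the original diff (it reuses the
            # `paddle` / `nn` imports from this example): since Input(X) should hold
            # log-probabilities, a typical call pairs log_softmax with a softmax target.
            import paddle.nn.functional as F
            log_prob = F.log_softmax(paddle.rand([5, 20]), axis=-1)
            prob_target = F.softmax(paddle.rand([5, 20]), axis=-1)
            pred_loss = nn.KLDivLoss(reduction='mean')(log_prob, prob_target)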
""" """
def __init__(self, reduction='mean'): def __init__(self, reduction='mean'):
...@@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer): ...@@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer):
class SoftMarginLoss(Layer): class SoftMarginLoss(Layer):
r""" r"""
Creates a criterion that measures a two-class soft margin loss between input predictions ``input`` Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as: and target labels ``label`` . It can be described as:
...@@ -1738,16 +1711,13 @@ class SoftMarginLoss(Layer): ...@@ -1738,16 +1711,13 @@ class SoftMarginLoss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shapes: Shapes:
- Input (Tensor): The input tensor with shape: ``[N, *]``,
Input (Tensor): The input tensor with shape: [N, *],
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf
Available dtype is float32, float64. Available dtype is float32, float64.
- Label (Tensor): The target labels tensor with the same shape as
Label (Tensor): The target labels tensor with the same shape as
``input``. The target labels which values should be numbers -1 or 1. ``input``. The target labels which values should be numbers -1 or 1.
Available dtype is int32, int64, float32, float64. Available dtype is int32, int64, float32, float64.
- Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1]. same as ``input`` , else the shape of output is [1].
Returns: Returns:
...@@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer): ...@@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer):
# [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511], # [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511],
# [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399], # [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399],
# [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]]) # [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]])
""" """
def __init__(self, reduction='mean', name=None): def __init__(self, reduction='mean', name=None):
...
@@ -321,6 +321,7 @@ Where `H` means height of feature map, `W` means width of feature map.
class GroupNorm(Layer):
    """

    This interface is used to construct a callable object of the ``GroupNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Group Normalization Layer.
@@ -341,7 +342,7 @@ class GroupNorm(Layer):
        name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - x: Tensor with shape: :attr:`(batch, num_features, *)`.
        - output: The same shape as input x.

    Returns:
@@ -1047,6 +1048,7 @@ class BatchNorm3D(_BatchNormBase):
class SyncBatchNorm(_BatchNormBase):
    r"""

    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully connected
@@ -1092,9 +1094,9 @@ class SyncBatchNorm(_BatchNormBase):
    - :math:`\beta` : trainable shift parameter vector

    Note:
        If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the
        evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of
        :ref:`api_paddle_hub_list` to pack the model.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
@@ -1112,8 +1114,8 @@ class SyncBatchNorm(_BatchNormBase):
            have trainable bias parameter. Default: None.

    Shapes:
        - input: Tensor that the dimension from 2 to 5.
        - output: Tensor with the same shape as input.

    Examples:
        .. code-block:: python
@@ -1135,6 +1137,7 @@ class SyncBatchNorm(_BatchNormBase):
            # [[ 0.80956620, -0.66528702],
            # [-1.27446556, 1.13018656]]]])

    """

    def __init__(
@@ -1284,8 +1287,8 @@ class SyncBatchNorm(_BatchNormBase):
            The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead.

        Examples:
            .. code-block:: python

                import paddle
                import paddle.nn as nn
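                # Illustrative continuation, not part of the original diff: convert the
                # BatchNorm layers of an existing model into SyncBatchNorm layers before
                # wrapping the model for multi-GPU training.
                model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5))
                sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
                print(sync_model)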
...
@@ -224,6 +224,7 @@ class AvgPool2D(Layer):
class AvgPool3D(Layer):
    """

    This operation applies 3D avg pooling over input features based on the input,
    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
    in NCDHW format, where N is batch size, C is the number of channels,
@@ -264,6 +265,7 @@ class AvgPool3D(Layer):
          The data type can be float32, float64.
        - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor.
          The data type is same as input x.

    Examples:
        .. code-block:: python
...
@@ -514,14 +514,17 @@ class QuantizedConv2D(Layer):
class QuantizedConv2DTranspose(Layer):
    """

    The computational logic of QuantizedConv2DTranspose is the same as Conv2DTranspose.
    The only difference is that its inputs are all fake quantized.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn as nn
            from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose

            x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
            conv = nn.Conv2DTranspose(4, 6, (3, 3))
            conv_quantized = QuantizedConv2DTranspose(conv)
@@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer):
            y_np = y_var.numpy()
            print(y_np.shape, y_quantized_np.shape)
            # (2, 6, 10, 10), (2, 6, 10, 10)

    """

    def __init__(self,
...
@@ -1661,6 +1661,7 @@ class MultiplicativeDecay(LRScheduler):
class OneCycleLR(LRScheduler):
    r"""

    Sets the learning rate according to the one cycle learning rate scheduler.
    The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
    from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.
@@ -1674,22 +1675,25 @@ class OneCycleLR(LRScheduler):
    Also note that you should update learning rate each step.

    Args:
        max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
        total_steps (int): Number of total training steps.
        divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
        end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate.
        phase_pct (float): The percentage of total steps used to increase the learning rate. Default: 0.3.
        anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
        three_phase (bool, optional): Whether to use three phase.

            If ``True``:

                1. The learning rate will first increase from initial learning rate to maximum learning rate.
                2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
                3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.

            If ``False``:

                1. The learning rate will increase to maximum learning rate.
                2. Then it will directly decrease to minimum learning rate.

        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
@@ -1741,6 +1745,7 @@ class OneCycleLR(LRScheduler):
                },
                fetch_list=loss.name)
            scheduler.step()    # You should update learning rate each step

    """

    def __init__(
...
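A condensed dygraph sketch of the schedule described above (illustrative only, not part of the diff; it assumes the paddle.optimizer.lr.OneCycleLR API and, as the note requires, steps the scheduler once per batch):

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    scheduler = paddle.optimizer.lr.OneCycleLR(
        max_learning_rate=1.0, total_steps=100, divide_factor=25, verbose=False
    )
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

    for step in range(100):
        x = paddle.uniform([4, 10])
        loss = paddle.mean(linear(x))
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        scheduler.step()   # update the learning rate after every step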
@@ -124,7 +124,8 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
    if frame_length > x.shape[axis]:
        raise ValueError(
            f'Attribute frame_length should be less equal than sequence length, '
            f'but got ({frame_length}) > ({x.shape[axis]}).'
        )

    op_type = 'frame'
@@ -132,25 +133,33 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
        return _C_ops.frame(x, frame_length, hop_length, axis)

    if _in_legacy_dygraph():
        attrs = (
            'frame_length',
            frame_length,
            'hop_length',
            hop_length,
            'axis',
            axis,
        )
        op = getattr(_legacy_C_ops, op_type)
        out = op(x, *attrs)
    else:
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
        )
        helper = LayerHelper(op_type, **locals())
        dtype = helper.input_dtype(input_param_name='x')
        out = helper.create_variable_for_type_inference(dtype=dtype)
        helper.append_op(
            type=op_type,
            inputs={'X': x},
            attrs={
                'frame_length': frame_length,
                'hop_length': hop_length,
                'axis': axis,
            },
            outputs={'Out': out},
        )
    return out
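A quick, hedged illustration of what ``frame`` produces (not part of the diff; it assumes the public paddle.signal.frame API):

.. code-block:: python

    import paddle
    from paddle.signal import frame

    x = paddle.arange(16, dtype='float32')            # a 1-D signal of length 16
    frames = frame(x, frame_length=4, hop_length=2)   # slide a length-4 window by 2
    print(frames.shape)                               # [4, 7]: frame_length x num_frames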
@@ -225,22 +234,22 @@ def overlap_add(x, hop_length, axis=-1, name=None):
        out = op(x, *attrs)
    else:
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type
        )
        helper = LayerHelper(op_type, **locals())
        dtype = helper.input_dtype(input_param_name='x')
        out = helper.create_variable_for_type_inference(dtype=dtype)
        helper.append_op(
            type=op_type,
            inputs={'X': x},
            attrs={'hop_length': hop_length, 'axis': axis},
            outputs={'Out': out},
        )
    return out
def stft(
    x,
    n_fft,
    hop_length=None,
    win_length=None,
@@ -249,8 +258,10 @@ def stft(x,
    pad_mode='reflect',
    normalized=False,
    onesided=True,
    name=None,
):
    r"""

    Short-time Fourier transform (STFT).

    The STFT computes the discrete Fourier transforms (DFT) of short overlapping
@@ -263,9 +274,12 @@ def stft(x,
    Where:

    - :math:`t`: The :math:`t`-th input window.

    - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`,
      or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`.

    - :math:`N`: Value of `n_fft`.

    - :math:`H`: Value of `hop_length`.

    Args:
@@ -292,9 +306,9 @@ def stft(x,
            to set this property. For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`
        (real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`
        (`onesided` is `False`)

    Examples:
        .. code-block:: python
@@ -311,14 +325,17 @@ def stft(x,
            x = paddle.randn([8, 48000], dtype=paddle.float64) + \
                paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128
            y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372]

    """
check_variable_and_dtype(x, 'x', check_variable_and_dtype(
['float32', 'float64', 'complex64', 'complex128'], x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft'
'stft') )
x_rank = len(x.shape) x_rank = len(x.shape)
assert x_rank in [1, 2], \ assert x_rank in [
f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' 1,
2,
], f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}'
if x_rank == 1: # (batch, seq_length) if x_rank == 1: # (batch, seq_length)
x = x.unsqueeze(0) x = x.unsqueeze(0)
...@@ -326,69 +343,77 @@ def stft(x, ...@@ -326,69 +343,77 @@ def stft(x,
if hop_length is None: if hop_length is None:
hop_length = int(n_fft // 4) hop_length = int(n_fft // 4)
assert hop_length > 0, \ assert hop_length > 0, f'hop_length should be > 0, but got {hop_length}.'
f'hop_length should be > 0, but got {hop_length}.'
if win_length is None: if win_length is None:
win_length = n_fft win_length = n_fft
if _non_static_mode(): if _non_static_mode():
assert 0 < n_fft <= x.shape[-1], \ assert (
f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' 0 < n_fft <= x.shape[-1]
), f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
assert 0 < win_length <= n_fft, \ assert (
f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' 0 < win_length <= n_fft
), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
if window is not None: if window is not None:
assert len(window.shape) == 1 and len(window) == win_length, \ assert (
f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' len(window.shape) == 1 and len(window) == win_length
), f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.'
else: else:
window = paddle.ones(shape=(win_length, ), dtype=x.dtype) window = paddle.ones(shape=(win_length,), dtype=x.dtype)
if win_length < n_fft: if win_length < n_fft:
pad_left = (n_fft - win_length) // 2 pad_left = (n_fft - win_length) // 2
pad_right = n_fft - win_length - pad_left pad_right = n_fft - win_length - pad_left
window = paddle.nn.functional.pad(window, window = paddle.nn.functional.pad(
pad=[pad_left, pad_right], window, pad=[pad_left, pad_right], mode='constant'
mode='constant') )
if center: if center:
assert pad_mode in ['constant', 'reflect'], \ assert pad_mode in [
'pad_mode should be "reflect" or "constant", but got "{}".'.format(pad_mode) 'constant',
'reflect',
], 'pad_mode should be "reflect" or "constant", but got "{}".'.format(
pad_mode
)
pad_length = n_fft // 2 pad_length = n_fft // 2
# FIXME: Input `x` can be a complex tensor but pad does not support complex input. # FIXME: Input `x` can be a complex tensor but pad does not support complex input.
x = paddle.nn.functional.pad(x.unsqueeze(-1), x = paddle.nn.functional.pad(
x.unsqueeze(-1),
pad=[pad_length, pad_length], pad=[pad_length, pad_length],
mode=pad_mode, mode=pad_mode,
data_format="NLC").squeeze(-1) data_format="NLC",
).squeeze(-1)
x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1) x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1)
x_frames = x_frames.transpose( x_frames = x_frames.transpose(
perm=[0, 2, perm=[0, 2, 1]
1]) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft) ) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft)
x_frames = paddle.multiply(x_frames, window) x_frames = paddle.multiply(x_frames, window)
norm = 'ortho' if normalized else 'backward' norm = 'ortho' if normalized else 'backward'
if is_complex(x_frames): if is_complex(x_frames):
assert not onesided, \ assert (
'onesided should be False when input or window is a complex Tensor.' not onesided
), 'onesided should be False when input or window is a complex Tensor.'
if not is_complex(x): if not is_complex(x):
out = fft_r2c(x=x_frames, out = fft_r2c(
x=x_frames,
n=None, n=None,
axis=-1, axis=-1,
norm=norm, norm=norm,
forward=True, forward=True,
onesided=onesided, onesided=onesided,
name=name) name=name,
)
else: else:
out = fft_c2c(x=x_frames, out = fft_c2c(
n=None, x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name
axis=-1, )
norm=norm,
forward=True,
name=name)
out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames) out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames)
...@@ -398,7 +423,8 @@ def stft(x, ...@@ -398,7 +423,8 @@ def stft(x,
return out return out
def istft(x, def istft(
x,
n_fft, n_fft,
hop_length=None, hop_length=None,
win_length=None, win_length=None,
...@@ -408,7 +434,8 @@ def istft(x, ...@@ -408,7 +434,8 @@ def istft(x,
onesided=True, onesided=True,
length=None, length=None,
return_complex=False, return_complex=False,
name=None): name=None,
):
r""" r"""
Inverse short-time Fourier transform (ISTFT). Inverse short-time Fourier transform (ISTFT).
...@@ -484,8 +511,12 @@ def istft(x, ...@@ -484,8 +511,12 @@ def istft(x,
check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft') check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft')
x_rank = len(x.shape) x_rank = len(x.shape)
assert x_rank in [2, 3], \ assert x_rank in [
'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(x_rank) 2,
3,
], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(
x_rank
)
if x_rank == 2: # (batch, n_fft, n_frames) if x_rank == 2: # (batch, n_fft, n_frames)
x = x.unsqueeze(0) x = x.unsqueeze(0)
...@@ -497,83 +528,107 @@ def istft(x, ...@@ -497,83 +528,107 @@ def istft(x,
win_length = n_fft win_length = n_fft
# Assure no gaps between frames. # Assure no gaps between frames.
assert 0 < hop_length <= win_length, \ assert (
'hop_length should be in (0, win_length({})], but got {}.'.format(win_length, hop_length) 0 < hop_length <= win_length
), 'hop_length should be in (0, win_length({})], but got {}.'.format(
win_length, hop_length
)
assert 0 < win_length <= n_fft, \ assert (
'win_length should be in (0, n_fft({})], but got {}.'.format(n_fft, win_length) 0 < win_length <= n_fft
), 'win_length should be in (0, n_fft({})], but got {}.'.format(
n_fft, win_length
)
n_frames = x.shape[-1] n_frames = x.shape[-1]
fft_size = x.shape[-2] fft_size = x.shape[-2]
if _non_static_mode(): if _non_static_mode():
if onesided: if onesided:
assert (fft_size == n_fft // 2 + 1), \ assert (
'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) fft_size == n_fft // 2 + 1
), 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(
n_fft // 2 + 1, fft_size
)
else: else:
assert (fft_size == n_fft), \ assert (
'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) fft_size == n_fft
), 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(
n_fft, fft_size
)
if window is not None: if window is not None:
assert len(window.shape) == 1 and len(window) == win_length, \ assert (
'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape) len(window.shape) == 1 and len(window) == win_length
), 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(
win_length, window.shape
)
else: else:
window_dtype = paddle.float32 if x.dtype in [ window_dtype = (
paddle.float32, paddle.complex64 paddle.float32
] else paddle.float64 if x.dtype in [paddle.float32, paddle.complex64]
window = paddle.ones(shape=(win_length, ), dtype=window_dtype) else paddle.float64
)
window = paddle.ones(shape=(win_length,), dtype=window_dtype)
if win_length < n_fft: if win_length < n_fft:
pad_left = (n_fft - win_length) // 2 pad_left = (n_fft - win_length) // 2
pad_right = n_fft - win_length - pad_left pad_right = n_fft - win_length - pad_left
# FIXME: Input `window` can be a complex tensor but pad does not support complex input. # FIXME: Input `window` can be a complex tensor but pad does not support complex input.
window = paddle.nn.functional.pad(window, window = paddle.nn.functional.pad(
pad=[pad_left, pad_right], window, pad=[pad_left, pad_right], mode='constant'
mode='constant') )
x = x.transpose( x = x.transpose(
perm=[0, 2, perm=[0, 2, 1]
1]) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft) ) # switch n_fft to last dim, e.g. (batch, num_frames, n_fft)
norm = 'ortho' if normalized else 'backward' norm = 'ortho' if normalized else 'backward'
if return_complex: if return_complex:
assert not onesided, \ assert (
'onesided should be False when input(output of istft) or window is a complex Tensor.' not onesided
), 'onesided should be False when input(output of istft) or window is a complex Tensor.'
out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
else: else:
assert not is_complex(window), \ assert not is_complex(
'Data type of window should not be complex when return_complex is False.' window
), 'Data type of window should not be complex when return_complex is False.'
if onesided is False: if onesided is False:
x = x[:, :, :n_fft // 2 + 1] x = x[:, :, : n_fft // 2 + 1]
out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
out = paddle.multiply(out, window).transpose( out = paddle.multiply(out, window).transpose(
perm=[0, 2, 1]) # (batch, n_fft, num_frames) perm=[0, 2, 1]
out = overlap_add(x=out, hop_length=hop_length, ) # (batch, n_fft, num_frames)
axis=-1) # (batch, seq_length) out = overlap_add(
x=out, hop_length=hop_length, axis=-1
) # (batch, seq_length)
window_envelop = overlap_add( window_envelop = overlap_add(
x=paddle.tile( x=paddle.tile(
x=paddle.multiply(window, window).unsqueeze(0), x=paddle.multiply(window, window).unsqueeze(0),
repeat_times=[n_frames, repeat_times=[n_frames, 1],
1]).transpose(perm=[1, 0]), # (n_fft, num_frames) ).transpose(
perm=[1, 0]
), # (n_fft, num_frames)
hop_length=hop_length, hop_length=hop_length,
axis=-1) # (seq_length, ) axis=-1,
) # (seq_length, )
if length is None: if length is None:
if center: if center:
out = out[:, (n_fft // 2):-(n_fft // 2)] out = out[:, (n_fft // 2) : -(n_fft // 2)]
window_envelop = window_envelop[(n_fft // 2):-(n_fft // 2)] window_envelop = window_envelop[(n_fft // 2) : -(n_fft // 2)]
else: else:
if center: if center:
start = n_fft // 2 start = n_fft // 2
else: else:
start = 0 start = 0
out = out[:, start:start + length] out = out[:, start : start + length]
window_envelop = window_envelop[start:start + length] window_envelop = window_envelop[start : start + length]
# Check whether the Nonzero Overlap Add (NOLA) constraint is met. # Check whether the Nonzero Overlap Add (NOLA) constraint is met.
if _non_static_mode() and window_envelop.abs().min().item() < 1e-11: if _non_static_mode() and window_envelop.abs().min().item() < 1e-11:
......
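The two routines in this file form an analysis/synthesis pair. As a quick sanity check of the shapes described in the docstrings, here is a minimal round-trip sketch (not part of the diff; it assumes Paddle 2.x, where both functions are exposed as ``paddle.signal.stft`` and ``paddle.signal.istft``):

.. code-block:: python

    # Hedged example: analyse a real batch of signals and reconstruct it.
    # With onesided=True (the default) the spectrum has n_fft // 2 + 1 bins.
    import paddle

    x = paddle.randn([4, 16000], dtype='float32')              # (batch, seq_length)
    spec = paddle.signal.stft(x, n_fft=512, hop_length=128)    # complex64, roughly [4, 257, num_frames]
    x_rec = paddle.signal.istft(spec, n_fft=512, hop_length=128, length=16000)
    print(spec.shape, x_rec.shape)                             # e.g. [4, 257, 126] and [4, 16000]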
...@@ -20,6 +20,7 @@ __all__ = [] ...@@ -20,6 +20,7 @@ __all__ = []
class ReLU(Layer): class ReLU(Layer):
""" """
Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math:: .. math::
...@@ -44,6 +45,7 @@ class ReLU(Layer): ...@@ -44,6 +45,7 @@ class ReLU(Layer):
relu = paddle.sparse.nn.ReLU() relu = paddle.sparse.nn.ReLU()
out = relu(sparse_x) out = relu(sparse_x)
# [0., 0., 1.] # [0., 0., 1.]
""" """
def __init__(self, name=None): def __init__(self, name=None):
...@@ -59,7 +61,8 @@ class ReLU(Layer): ...@@ -59,7 +61,8 @@ class ReLU(Layer):
class Softmax(Layer): class Softmax(Layer):
""" r"""
Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
Note: Note:
...@@ -126,6 +129,7 @@ class Softmax(Layer): ...@@ -126,6 +129,7 @@ class Softmax(Layer):
class ReLU6(Layer): class ReLU6(Layer):
""" """
Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math:: .. math::
...@@ -149,6 +153,7 @@ class ReLU6(Layer): ...@@ -149,6 +153,7 @@ class ReLU6(Layer):
sparse_x = dense_x.to_sparse_coo(1) sparse_x = dense_x.to_sparse_coo(1)
relu6 = paddle.sparse.nn.ReLU6() relu6 = paddle.sparse.nn.ReLU6()
out = relu6(sparse_x) out = relu6(sparse_x)
""" """
def __init__(self, name=None): def __init__(self, name=None):
...@@ -164,7 +169,8 @@ class ReLU6(Layer): ...@@ -164,7 +169,8 @@ class ReLU6(Layer):
class LeakyReLU(Layer): class LeakyReLU(Layer):
""" r"""
Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor.
.. math:: .. math::
...@@ -196,6 +202,7 @@ class LeakyReLU(Layer): ...@@ -196,6 +202,7 @@ class LeakyReLU(Layer):
sparse_x = dense_x.to_sparse_coo(1) sparse_x = dense_x.to_sparse_coo(1)
leaky_relu = paddle.sparse.nn.LeakyReLU(0.5) leaky_relu = paddle.sparse.nn.LeakyReLU(0.5)
out = leaky_relu(sparse_x) out = leaky_relu(sparse_x)
""" """
def __init__(self, negative_slope=0.01, name=None): def __init__(self, negative_slope=0.01, name=None):
......
...@@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None): ...@@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None):
def meshgrid(*args, **kwargs): def meshgrid(*args, **kwargs):
""" """
Takes a list of N tensors as input *args, each of which is a 1-dimensional vector, and creates N-dimensional grids.
Takes a list of N tensors as input :attr:`*args`, each of which is a 1-dimensional vector, and creates N-dimensional grids.
Args: Args:
*args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),
......
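For reference, the behaviour described in the ``meshgrid`` docstring above can be seen with a two-tensor call (a hedged sketch, not part of the diff; assumes Paddle 2.x):

.. code-block:: python

    import paddle

    x = paddle.arange(3, dtype='float32')      # shape [3]
    y = paddle.arange(4, dtype='float32')      # shape [4]
    grid_x, grid_y = paddle.meshgrid(x, y)     # two grids, each of shape [3, 4]
    print(grid_x.shape, grid_y.shape)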
...@@ -22,9 +22,17 @@ from .math import multiply ...@@ -22,9 +22,17 @@ from .math import multiply
from .math import sum as paddle_sum from .math import sum as paddle_sum
from ..fluid.framework import _in_legacy_dygraph from ..fluid.framework import _in_legacy_dygraph
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid.data_feeder import (
check_variable_and_dtype,
check_type,
check_dtype,
)
from ..fluid.layer_helper import LayerHelper from ..fluid.layer_helper import LayerHelper
from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from ..fluid.framework import (
_non_static_mode,
in_dygraph_mode,
_in_legacy_dygraph,
)
import collections import collections
import string import string
import opt_einsum import opt_einsum
...@@ -52,12 +60,13 @@ def parse_op_labels(labelstr, operand): ...@@ -52,12 +60,13 @@ def parse_op_labels(labelstr, operand):
''' '''
# Sanity checks # Sanity checks
for c in labelstr.replace('.', ''): for c in labelstr.replace('.', ''):
assert c.isalpha(), ( assert (
f"Invalid equation: {c} is not a valid label, which should be letters." c.isalpha()
) ), f"Invalid equation: {c} is not a valid label, which should be letters."
assert labelstr.replace('...', '', 1).find('.') == -1, ( assert (
f"Invalid equation: `.` is found outside of an ellipsis.") labelstr.replace('...', '', 1).find('.') == -1
), f"Invalid equation: `.` is found outside of an ellipsis."
# Check shape. Note, in Paddle a tensor rank is always nonzero # Check shape. Note, in Paddle a tensor rank is always nonzero
ndims = len(operand.shape) ndims = len(operand.shape)
...@@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand): ...@@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand):
full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3)) full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3))
assert len(full_labelstr) == ndims, ( assert (
f"Invalid equation: the label string '{labelstr}' misses dimensions.") len(full_labelstr) == ndims
), f"Invalid equation: the label string '{labelstr}' misses dimensions."
return full_labelstr return full_labelstr
...@@ -90,7 +100,8 @@ def parse_labels(labelstr, operands): ...@@ -90,7 +100,8 @@ def parse_labels(labelstr, operands):
nop_labels = labelstr.split(',') nop_labels = labelstr.split(',')
assert len(nop_labels) == len(operands), ( assert len(nop_labels) == len(operands), (
f"Invalid equation: the number of operands is {len(operands)}, " f"Invalid equation: the number of operands is {len(operands)}, "
f"but found {len(nop_labels)} segments in the label equation.") f"but found {len(nop_labels)} segments in the label equation."
)
return list(map(parse_op_labels, nop_labels, operands)) return list(map(parse_op_labels, nop_labels, operands))
...@@ -101,8 +112,9 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): ...@@ -101,8 +112,9 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
''' '''
# Sanity check. # Sanity check.
if n_bcast_dims > 0: if n_bcast_dims > 0:
assert '...' in rhs, ( assert (
f"Invalid equation: missing ellipsis in output labels.") '...' in rhs
), f"Invalid equation: missing ellipsis in output labels."
rhs = rhs.replace('...', '') rhs = rhs.replace('...', '')
rhs_set = set(rhs) rhs_set = set(rhs)
...@@ -114,10 +126,12 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): ...@@ -114,10 +126,12 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
non_input_labels = rhs_set.difference(input_labels) non_input_labels = rhs_set.difference(input_labels)
assert not non_input_labels, ( assert not non_input_labels, (
f"Invalid equation: " f"Invalid equation: "
f"output label {sorted(non_input_labels)} not used by any input.") f"output label {sorted(non_input_labels)} not used by any input."
)
# Verify that output labels are not duplicate # Verify that output labels are not duplicate
assert len(rhs) == len(rhs_set), ( assert len(rhs) == len(
f"Invalid equation: duplicate output labels are found.") rhs_set
), f"Invalid equation: duplicate output labels are found."
def build_view(in_labels, out_labels): def build_view(in_labels, out_labels):
...@@ -159,8 +173,8 @@ def build_view(in_labels, out_labels): ...@@ -159,8 +173,8 @@ def build_view(in_labels, out_labels):
# fill the broadcast dimension indices from right to left. # fill the broadcast dimension indices from right to left.
if s: if s:
for ax, dim in zip( for ax, dim in zip(
range(start, end)[::-1], range(start, end)[::-1], range(s.start(), s.end())[::-1]
range(s.start(), s.end())[::-1]): ):
inv_map[ax] = dim inv_map[ax] = dim
# Now work on non-broadcast dimensions # Now work on non-broadcast dimensions
...@@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): ...@@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims):
g_labels_out = rhs.replace('...', '.' * n_bcast_dims) g_labels_out = rhs.replace('...', '.' * n_bcast_dims)
else: else:
g_labels_out = '.' * n_bcast_dims + ''.join( g_labels_out = '.' * n_bcast_dims + ''.join(
l for l, c in zip(labels, count) if c == 1) l for l, c in zip(labels, count) if c == 1
)
for i in range(len(count))[::-1]: for i in range(len(count))[::-1]:
if labels[i] in g_labels_out: if labels[i] in g_labels_out:
...@@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes): ...@@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes):
assert not non_bcastable, ( assert not non_bcastable, (
f"Invalid operands: label {g_labels[non_bcastable[0]]} " f"Invalid operands: label {g_labels[non_bcastable[0]]} "
f"corresponds to non-broadcastable dimensions.") f"corresponds to non-broadcastable dimensions."
)
g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape]
g_masks = [[s > 1 or s == -1 for s in view_shape] g_masks = [
for view_shape in view_shapes] [s > 1 or s == -1 for s in view_shape] for view_shape in view_shapes
]
return g_shape, g_masks return g_shape, g_masks
...@@ -297,8 +314,9 @@ def diagonalize(labels, operand): ...@@ -297,8 +314,9 @@ def diagonalize(labels, operand):
-------- --------
'ijj...i' would be merged into 'ij...' 'ijj...i' would be merged into 'ij...'
''' '''
assert not has_duplicated_labels(labels), ( assert not has_duplicated_labels(
f'Duplicate labels are not supported.') labels
), f'Duplicate labels are not supported.'
return labels, operand return labels, operand
...@@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): ...@@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
plan.add_step(step) plan.add_step(step)
# Check if conditions hold for turning the operation into a matmul # Check if conditions hold for turning the operation into a matmul
if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( if (
(op1_vshape, op2_vshape)): j1 + j2 > 0
op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) and k > 0
] + [np.prod(op1_vshape[K])] and -1 not in np.concatenate((op1_vshape, op2_vshape))
op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) ):
] + [np.prod(op2_vshape[K])] op1_shape = (
list(op1_vshape[I])
+ [np.prod(op1_vshape[J1])]
+ [np.prod(op1_vshape[K])]
)
op2_shape = (
list(op2_vshape[I])
+ [np.prod(op2_vshape[J2])]
+ [np.prod(op2_vshape[K])]
)
# Merge J dims and K dims by reshaping # Merge J dims and K dims by reshaping
step = reshape, [var1], var1, op1_shape step = reshape, [var1], var1, op1_shape
...@@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): ...@@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
step = squeeze, [var2], var2, [-1, -2] step = squeeze, [var2], var2, [-1, -2]
plan.add_step(step) plan.add_step(step)
elif j1 + j2 == 0 and not -1 in np.concatenate( elif j1 + j2 == 0 and not -1 in np.concatenate(
(op1_vshape[K], op2_vshape[K])): (op1_vshape[K], op2_vshape[K])
):
assert all(op1_vshape[K] == op2_vshape[K]) assert all(op1_vshape[K] == op2_vshape[K])
step = reshape, [ step = (
var1 reshape,
], var1, list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])] [var1],
var1,
list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])],
)
plan.add_step(step) plan.add_step(step)
step = reshape, [ step = (
var2 reshape,
], var2, list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])] [var2],
var2,
list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])],
)
plan.add_step(step) plan.add_step(step)
step = matmul, [var1, var2], var2, False, True step = matmul, [var1, var2], var2, False, True
plan.add_step(step) plan.add_step(step)
...@@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): ...@@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
g_view[op2] = list(op2_view) g_view[op2] = list(op2_view)
def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, def plan_summation(
n_bcast): plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast
):
''' '''
Plan various kinds of summation Plan various kinds of summation
''' '''
...@@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, ...@@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count,
I, K, J1, J2 = list(range(n_bcast)), [], [], [] I, K, J1, J2 = list(range(n_bcast)), [], [], []
for ax, dim1, dim2 in zip(range(n_bcast, ndim), op1_view[n_bcast:], for ax, dim1, dim2 in zip(
op2_view[n_bcast:]): range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:]
):
if (dim1 != -1) != (dim2 != -1): if (dim1 != -1) != (dim2 != -1):
if dim1 != -1: if dim1 != -1:
...@@ -531,7 +567,6 @@ def plan_broadcast(plan, operands, nop_axes): ...@@ -531,7 +567,6 @@ def plan_broadcast(plan, operands, nop_axes):
class Plan: class Plan:
def __init__(self): def __init__(self):
self.env = {} self.env = {}
self.steps = [] self.steps = []
...@@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): ...@@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast):
# op1 is a one element tensor. # op1 is a one element tensor.
plan_scalar_prod(plan, i - 1, i) plan_scalar_prod(plan, i - 1, i)
else: else:
plan_summation(plan, g_view, i - 1, i, g_supports, g_shape, g_count, plan_summation(
n_bcast) plan, g_view, i - 1, i, g_supports, g_shape, g_count, n_bcast
)
# for ax, dim in enumerate(g_view[nop-1][:nout]): # for ax, dim in enumerate(g_view[nop-1][:nout]):
# assert dim == ax # assert dim == ax
...@@ -678,7 +714,9 @@ def preprocess(equation, *operands): ...@@ -678,7 +714,9 @@ def preprocess(equation, *operands):
""" """
equation = equation.replace(" ", "") equation = equation.replace(" ", "")
nop = len(operands) nop = len(operands)
assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop assert nop > 0, (
"Required at least one operand in Einsum API, but received %s " % nop
)
# Part the equation to left hand side and right hand side # Part the equation to left hand side and right hand side
lhs, *rhs = equation.lower().split('->') lhs, *rhs = equation.lower().split('->')
...@@ -692,22 +730,27 @@ def preprocess(equation, *operands): ...@@ -692,22 +730,27 @@ def preprocess(equation, *operands):
assert len(lhs.split(',')) == len(operands), ( assert len(lhs.split(',')) == len(operands), (
f"Invalid equation: the number of operands is {len(operands)}, " f"Invalid equation: the number of operands is {len(operands)}, "
f"but found {len(lhs.split(','))} segments in the label equation.") f"but found {len(lhs.split(','))} segments in the label equation."
)
assert not ('...' in lhs and '...' not in rhs assert not (
'...' in lhs and '...' not in rhs
), f'Invalid equation: missing ellipsis in output labels.' ), f'Invalid equation: missing ellipsis in output labels.'
assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > assert not (
0), f'Duplicate labels are not supported.' len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0
), f'Duplicate labels are not supported.'
assert not has_duplicated_labels( assert not has_duplicated_labels(
rhs), f'Invalid equation: duplicate output labels are found.' rhs
), f'Invalid equation: duplicate output labels are found.'
return lhs, rhs, labels return lhs, rhs, labels
def parse_fake_shape(equation, operands, labels): def parse_fake_shape(equation, operands, labels):
""" """
this shape is just used for operands planning. may differ with the original shape. this shape is just used for operands planning. may differ with the original shape.
for example: for example:
... is replaced by 1 ... is replaced by 1
...@@ -715,14 +758,15 @@ def parse_fake_shape(equation, operands, labels): ...@@ -715,14 +758,15 @@ def parse_fake_shape(equation, operands, labels):
Results Results
------- -------
list of shape list of shape
""" """
shaped = collections.namedtuple('shaped', ['shape']) shaped = collections.namedtuple('shaped', ['shape'])
def fake_shape(label, op): def fake_shape(label, op):
assert len(op.shape) == len( assert len(op.shape) == len(label), (
label "length of shape and length of label must be the same, but received %d != %d"
), "length of shape and length of label must be the same, but received %d != %d" % ( % (len(op.shape), len(label))
len(op.shape), len(label)) )
fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.'] fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.']
fakes = list(map(abs, fakes)) # make -1 -> 1 fakes = list(map(abs, fakes)) # make -1 -> 1
if '.' in label: if '.' in label:
...@@ -734,7 +778,6 @@ def parse_fake_shape(equation, operands, labels): ...@@ -734,7 +778,6 @@ def parse_fake_shape(equation, operands, labels):
def rhs_inference(lhs): def rhs_inference(lhs):
def is_free(key): def is_free(key):
return cnt.get(key) == 1 and key not in ['.', ','] return cnt.get(key) == 1 and key not in ['.', ',']
...@@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs): ...@@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs):
def get_used_label(counter): def get_used_label(counter):
used = set(counter.elements()) used = set(counter.elements())
for c in string.ascii_lowercase: for c in string.ascii_lowercase:
if c not in used: return c if c not in used:
return c
raise ValueError( raise ValueError(
"You have used all `a` - `z`, there can't find a unused for einsum optimization" "You have used all `a` - `z`, there can't find a unused for einsum optimization"
) )
...@@ -786,14 +830,15 @@ def einsum_v2(equation, *operands): ...@@ -786,14 +830,15 @@ def einsum_v2(equation, *operands):
var_list = list(operands) var_list = list(operands)
for path in cons: for path in cons:
(a, b), _, eq, *__ = path (a, b), _, eq, *__ = path
assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." assert (
a > b
), "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it."
var_s = [var_list.pop(a), var_list.pop(b)] var_s = [var_list.pop(a), var_list.pop(b)]
eq = eq.replace(broadcast_label, "...") eq = eq.replace(broadcast_label, "...")
var_list.append(gen_einsum_op(eq, *var_s)) var_list.append(gen_einsum_op(eq, *var_s))
assert len( assert (
var_list len(var_list) == 1
) == 1, "There must be one elements in list, but received %d." % len( ), "There must be one elements in list, but received %d." % len(var_list)
var_list)
return var_list[0] return var_list[0]
...@@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands): ...@@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands):
if _in_legacy_dygraph(): if _in_legacy_dygraph():
# dygraph # dygraph
return _legacy_C_ops.einsum(operands, len(operands), len(operands), return _legacy_C_ops.einsum(
'equation', equation)[0] operands, len(operands), len(operands), 'equation', equation
)[0]
for inp in operands: for inp in operands:
check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum')
...@@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands): ...@@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands):
helper.create_variable_for_type_inference(dtype=operands[0].dtype) helper.create_variable_for_type_inference(dtype=operands[0].dtype)
for i in range(len(operands)) for i in range(len(operands))
] ]
helper.append_op(type='einsum', helper.append_op(
type='einsum',
inputs={'Operands': operands}, inputs={'Operands': operands},
outputs={ outputs={'Out': out, "InnerCache": caches, "XShape": xshape},
'Out': out, attrs=attrs,
"InnerCache": caches, )
"XShape": xshape
},
attrs=attrs)
return out return out
def einsum(equation, *operands): def einsum(equation, *operands):
r""" r"""
einsum(equation, *operands) einsum(equation, *operands)
The current version of this API should be used in dygraph only mode. The current version of this API should be used in dygraph only mode.
...@@ -873,8 +918,7 @@ def einsum(equation, *operands): ...@@ -873,8 +918,7 @@ def einsum(equation, *operands):
dimensions into broadcasting dimensions. dimensions into broadcasting dimensions.
- Singular labels are called free labels; duplicate labels are dummy labels. Dummy labeled - Singular labels are called free labels; duplicate labels are dummy labels. Dummy labeled
dimensions will be reduced and removed in the output. dimensions will be reduced and removed in the output.
- Output labels can be explicitly specified on the right hand side of `->` or omitted. - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
In the latter case, the output labels will be inferred from the input labels.
- Inference of output labels - Inference of output labels
- Broadcasting label `...`, if present, is put on the leftmost position. - Broadcasting label `...`, if present, is put on the leftmost position.
- Free labels are reordered alphabetically and put after `...`. - Free labels are reordered alphabetically and put after `...`.
...@@ -884,10 +928,11 @@ def einsum(equation, *operands): ...@@ -884,10 +928,11 @@ def einsum(equation, *operands):
the sum over the original output. the sum over the original output.
- Non-input labels are invalid. - Non-input labels are invalid.
- Duplicate labels are invalid. - Duplicate labels are invalid.
- For any dummmy label which is present for the output, it's promoted to - For any dummy label which is present for the output, it's promoted to
a free label. a free label.
- For any free label which is not present for the output, it's lowered to - For any free label which is not present for the output, it's lowered to
a dummy label. a dummy label.
- Examples - Examples
- '...ij, ...jk', where i and k are free labels, j is dummy. The output label - '...ij, ...jk', where i and k are free labels, j is dummy. The output label
string is '...ik' string is '...ik'
...@@ -920,7 +965,7 @@ def einsum(equation, *operands): ...@@ -920,7 +965,7 @@ def einsum(equation, *operands):
operands should equal the number of input terms in the equation. operands should equal the number of input terms in the equation.
Returns: Returns:
result (`Tensor`): the result tensor. result (`Tensor`), the result tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -992,8 +1037,10 @@ def einsum(equation, *operands): ...@@ -992,8 +1037,10 @@ def einsum(equation, *operands):
# [[0.32043904, 0.18164253, 0.27810261], # [[0.32043904, 0.18164253, 0.27810261],
# [0.50226176, 0.24512935, 0.39881429], # [0.50226176, 0.24512935, 0.39881429],
# [0.51476848, 0.23367381, 0.39229113]]]) # [0.51476848, 0.23367381, 0.39229113]]])
""" """
import os import os
if int(os.environ.get('FLAGS_new_einsum', "1")): if int(os.environ.get('FLAGS_new_einsum', "1")):
return einsum_v2(equation, *operands) return einsum_v2(equation, *operands)
...@@ -1039,9 +1086,11 @@ def einsum(equation, *operands): ...@@ -1039,9 +1086,11 @@ def einsum(equation, *operands):
# Counting how many non-trivial dimensions remain for each ax # Counting how many non-trivial dimensions remain for each ax
g_labels, g_view, g_nout, g_count = build_global_view( g_labels, g_view, g_nout, g_count = build_global_view(
nop_labels, rhs, n_bcast_dims) nop_labels, rhs, n_bcast_dims
g_shape, g_supports = build_global_shape(g_view, g_labels, )
[op.shape for op in operands]) g_shape, g_supports = build_global_shape(
g_view, g_labels, [op.shape for op in operands]
)
# Now we're ready to build up an execution plan # Now we're ready to build up an execution plan
args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims
......
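A small sketch of the label-inference rules spelled out in the ``einsum`` docstring above (not part of the diff; assumes Paddle 2.x in dygraph mode):

.. code-block:: python

    import paddle

    x = paddle.rand([2, 3, 4])
    y = paddle.rand([2, 4, 5])
    # 'i' and 'k' are free labels and 'j' is a dummy label, so the inferred
    # output label string is '...ik'; both calls return a [2, 3, 5] tensor.
    implicit = paddle.einsum('...ij,...jk', x, y)
    explicit = paddle.einsum('...ij,...jk->...ik', x, y)
    print(paddle.allclose(implicit, explicit))   # expected True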
...@@ -1912,12 +1912,15 @@ def mv(x, vec, name=None): ...@@ -1912,12 +1912,15 @@ def mv(x, vec, name=None):
def det(x, name=None): def det(x, name=None):
""" """
Calculates the determinant value of a square matrix or batches of square matrices. Calculates the determinant value of a square matrix or batches of square matrices.
Args: Args:
x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the x (Tensor): the input matrix of size `(n, n)` or the
batch of matrices of size `(*, n, n)` where `*` is one or more batch of matrices of size `(*, n, n)` where `*` is one or more
batch dimensions. batch dimensions.
name(str, optional): Name of the output. Default is None. It's used
to print debug info for developers. Details: :ref:`api_guide_Name`
Returns: Returns:
Tensor, the determinant value of a square matrix or batches of square matrices. Tensor, the determinant value of a square matrix or batches of square matrices.
...@@ -1968,18 +1971,20 @@ def det(x, name=None): ...@@ -1968,18 +1971,20 @@ def det(x, name=None):
def slogdet(x, name=None): def slogdet(x, name=None):
""" """
Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches of square matrices' determinant. Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches of square matrices' determinant.
The determinant can be computed with ``sign * exp(logabsdet)``. The determinant can be computed with ``sign * exp(logabsdet)``.
Supports input of float, double Supports input of float, double
Note that for matrices that have zero determinant, this returns ``(0, -inf)`` Note that for matrices that have zero determinant, this returns ``(0, -inf)``
Args: Args:
x (Tensor): the batch of matrices of size :math:`(*, n, n)` x (Tensor): the batch of matrices of size :math:`(*, n, n)`
where :math:`*` is one or more batch dimensions. where :math:`*` is one or more batch dimensions.
Returns: Returns:
y (Tensor): A tensor containing the sign of the determinant and the natural logarithm y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
of the absolute value of determinant, respectively. of the absolute value of determinant, respectively.
Examples: Examples:
...@@ -2097,6 +2102,7 @@ def svd(x, full_matrices=False, name=None): ...@@ -2097,6 +2102,7 @@ def svd(x, full_matrices=False, name=None):
def matrix_power(x, n, name=None): def matrix_power(x, n, name=None):
r""" r"""
Computes the n-th power of a square matrix or a batch of square matrices. Computes the n-th power of a square matrix or a batch of square matrices.
Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be
...@@ -2122,7 +2128,7 @@ def matrix_power(x, n, name=None): ...@@ -2122,7 +2128,7 @@ def matrix_power(x, n, name=None):
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its
data type should be the same as that of `x`. data type should be the same as that of `x`.
Examples: Examples:
...@@ -3058,8 +3064,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): ...@@ -3058,8 +3064,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
def solve(x, y, name=None): def solve(x, y, name=None):
r""" r"""
Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'.
Let :math: `X` be a square matrix or a batch of square matrices, :math:`Y` be Let :math:`X` be a square matrix or a batch of square matrices, :math:`Y` be
a vector/matrix or a batch of vectors/matrices, the equation should be: a vector/matrix or a batch of vectors/matrices, the equation should be:
.. math:: .. math::
...@@ -3068,9 +3075,9 @@ def solve(x, y, name=None): ...@@ -3068,9 +3075,9 @@ def solve(x, y, name=None):
Specifically, this system of linear equations has one solution if and only if input 'X' is invertible. Specifically, this system of linear equations has one solution if and only if input 'X' is invertible.
Args: Args:
x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
name(str, optional): Name for the operation (optional, default is None). name(str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
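As a cross-check of the relation between ``det`` and ``slogdet`` documented above, a minimal sketch (not part of the diff; it assumes Paddle 2.x, where ``slogdet`` stacks the sign and the log-magnitude into a single tensor):

.. code-block:: python

    import paddle

    x = paddle.rand([3, 3], dtype='float64')
    out = paddle.linalg.slogdet(x)            # out[0]: sign, out[1]: log|det|
    det_recovered = out[0] * paddle.exp(out[1])
    print(paddle.allclose(det_recovered, paddle.linalg.det(x)))   # expected True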
...@@ -223,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -223,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
""" r"""
stanh activation. stanh activation.
.. math:: .. math::
...@@ -234,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): ...@@ -234,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
x (Tensor): The input Tensor with data type float32, float64. x (Tensor): The input Tensor with data type float32, float64.
scale_a (float, optional): The scale factor a of the input. Default is 0.67. scale_a (float, optional): The scale factor a of the input. Default is 0.67.
scale_b (float, optional): The scale factor b of the output. Default is 1.7159. scale_b (float, optional): The scale factor b of the output. Default is 1.7159.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
A Tensor with the same data type and shape as ``x`` . A Tensor with the same data type and shape as ``x`` .
......
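A quick numerical check of ``stanh`` as defined in the full docstring, namely ``out = scale_b * tanh(scale_a * x)`` (a hedged sketch, not part of the diff; assumes Paddle 2.x):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    out = paddle.stanh(x, scale_a=0.67, scale_b=1.7159)
    ref = 1.7159 * paddle.tanh(0.67 * x)
    print(paddle.allclose(out, ref))   # expected True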
...@@ -1301,6 +1301,7 @@ def distribute_fpn_proposals( ...@@ -1301,6 +1301,7 @@ def distribute_fpn_proposals(
name=None, name=None,
): ):
r""" r"""
In Feature Pyramid Networks (FPN) models, it is necessary to distribute In Feature Pyramid Networks (FPN) models, it is necessary to distribute
all proposals into different FPN levels, with respect to the scale of the proposals, all proposals into different FPN levels, with respect to the scale of the proposals,
the referring scale and the referring level. Besides, to restore the order of the referring scale and the referring level. Besides, to restore the order of
...@@ -1308,8 +1309,9 @@ def distribute_fpn_proposals( ...@@ -1308,8 +1309,9 @@ def distribute_fpn_proposals(
in current proposals. To compute FPN level for each roi, the formula is given as follows: in current proposals. To compute FPN level for each roi, the formula is given as follows:
.. math:: .. math::
roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
where BBoxArea is a function to compute the area of each roi. where BBoxArea is a function to compute the area of each roi.
Args: Args:
...@@ -1333,11 +1335,11 @@ def distribute_fpn_proposals( ...@@ -1333,11 +1335,11 @@ def distribute_fpn_proposals(
None by default. None by default.
Returns: Returns:
multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
and data type is same as `fpn_rois` . The length is max_level-min_level+1. and data type is same as `fpn_rois` . The length is max_level-min_level+1.
restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
, where N is the number of total rois. The data type is int32. , where N is the number of total rois. The data type is int32.
rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
the RoIs' number in each image on the corresponding level. The shape the RoIs' number in each image on the corresponding level. The shape
is [B] and data type of int32, where B is the number of images. is [B] and data type of int32, where B is the number of images.
...@@ -1356,6 +1358,7 @@ def distribute_fpn_proposals( ...@@ -1356,6 +1358,7 @@ def distribute_fpn_proposals(
refer_level=4, refer_level=4,
refer_scale=224, refer_scale=224,
rois_num=rois_num) rois_num=rois_num)
""" """
num_lvl = max_level - min_level + 1 num_lvl = max_level - min_level + 1
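To make the level formula above concrete, here is a worked example (not part of the diff; it assumes the logarithm is base 2, as in the FPN paper, while the docstring writes a plain ``log``, and the level range below is hypothetical):

.. code-block:: python

    import math

    refer_scale, refer_level = 224, 4        # as in the example call above
    min_level, max_level = 2, 5              # hypothetical level range
    w, h = 112.0, 112.0                      # hypothetical RoI width and height
    roi_scale = math.sqrt(w * h)             # 112.0
    level = math.floor(math.log2(roi_scale / refer_scale) + refer_level)
    level = min(max(level, min_level), max_level)
    print(level)                             # 3 -> this RoI is assigned to FPN level 3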
...@@ -2441,6 +2444,7 @@ def matrix_nms( ...@@ -2441,6 +2444,7 @@ def matrix_nms(
name=None, name=None,
): ):
""" """
This operator does matrix non-maximum suppression (NMS). This operator does matrix non-maximum suppression (NMS).
First selects a subset of candidate bounding boxes that have higher scores First selects a subset of candidate bounding boxes that have higher scores
than score_threshold (if provided), then the top k candidate is selected if than score_threshold (if provided), then the top k candidate is selected if
...@@ -2448,6 +2452,7 @@ def matrix_nms( ...@@ -2448,6 +2452,7 @@ def matrix_nms(
decayed according to the Matrix NMS scheme. decayed according to the Matrix NMS scheme.
After the NMS step, at most keep_top_k number of total bboxes are to be kept After the NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1. per image if keep_top_k is larger than -1.
Args: Args:
bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes, predicted locations of M bounding bboxes,
...@@ -2471,29 +2476,32 @@ def matrix_nms( ...@@ -2471,29 +2476,32 @@ def matrix_nms(
on score_threshold. on score_threshold.
keep_top_k (int): Number of total bboxes to be kept per image after NMS keep_top_k (int): Number of total bboxes to be kept per image after NMS
step. -1 means keeping all bboxes after NMS step. step. -1 means keeping all bboxes after NMS step.
use_gaussian (bool): Use Gaussian as the decay function. Default: False use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
background_label (int): The index of background label, the background background_label (int, optional): The index of background label, the background
label will be ignored. If set to -1, then all label will be ignored. If set to -1, then all
categories will be considered. Default: 0 categories will be considered. Default: 0
normalized (bool): Whether detections are normalized. Default: True normalized (bool, optional): Whether detections are normalized. Default: True
return_index(bool): Whether return selected index. Default: False return_index(bool, optional): Whether return selected index. Default: False
return_rois_num(bool): whether return rois_num. Default: True return_rois_num(bool, optional): whether return rois_num. Default: True
name(str): Name of the matrix nms op. Default: None. name(str, optional): Name of the matrix nms op. Default: None.
Returns: Returns:
A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
otherwise, a tuple with two Tensor (Out, RoisNum) is returned. otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
detection results. detection results.
Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
selected indices, which are absolute values across batches. selected indices, which are absolute values across batches.
rois_num (Tensor): A 1-D Tensor with shape [N] containing - rois_num (Tensor), A 1-D Tensor with shape [N] containing
the number of detected boxes in each image. the number of detected boxes in each image.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.vision.ops import matrix_nms from paddle.vision.ops import matrix_nms
boxes = paddle.rand([4, 1, 4]) boxes = paddle.rand([4, 1, 4])
boxes[..., 2] = boxes[..., 0] + boxes[..., 2] boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
boxes[..., 3] = boxes[..., 1] + boxes[..., 3] boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
...@@ -2501,6 +2509,7 @@ def matrix_nms( ...@@ -2501,6 +2509,7 @@ def matrix_nms(
out = matrix_nms(bboxes=boxes, scores=scores, background_label=0, out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
score_threshold=0.5, post_threshold=0.1, score_threshold=0.5, post_threshold=0.1,
nms_top_k=400, keep_top_k=200, normalized=False) nms_top_k=400, keep_top_k=200, normalized=False)
""" """
check_variable_and_dtype( check_variable_and_dtype(
bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms' bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'
......