diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 2a11dd7eace7f656b51fc10811fe77e26684a039..2cfded8c96013d5b32a3943f7f93cd6b49b9e24a 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -26,7 +26,6 @@ non_auto_func_called = True
 
 
 def __non_auto_func_called__(func):
-
     def __impl__(*args, **kwargs):
         global non_auto_func_called
         non_auto_func_called = False
@@ -112,14 +111,15 @@ class DistributedStrategy(object):
 
     def __init__(self):
        """
+
        DistributedStrategy is the main configuration entry for distributed training of Paddle.
        All of the distributed training configurations can be configured in DistributedStrategy,
-        such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), 
+        such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
        asynchronous update parameter server(ASGD), etc.
 
        DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file
 
-        Users who run local training usually configure BuildStrategy and ExecutionStrategy, and 
+        Users who run local training usually configure BuildStrategy and ExecutionStrategy, and
        DistributedStrategy supports configurations from BuildStrategy and ExecutionStrategy
 
        """
@@ -129,7 +129,8 @@ class DistributedStrategy(object):
         key = 'FLAGS_cudnn_batchnorm_spatial_persistent'
         if _global_flags().is_public(key):
             self.strategy.cudnn_batchnorm_spatial_persistent = bool(
-                _global_flags()[key])
+                _global_flags()[key]
+            )
         key = 'FLAGS_conv_workspace_size_limit'
         if _global_flags().is_public(key):
             self.strategy.conv_workspace_size_limit = int(_global_flags()[key])
@@ -144,43 +145,47 @@ class DistributedStrategy(object):
 
     def __setattr__(self, key, value):
         if self.__lock_attr and not hasattr(self, key):
-            raise TypeError("%s is not a attribute of %s" %
-                            (key, self.__class__.__name__))
+            raise TypeError(
+                "%s is not a attribute of %s" % (key, self.__class__.__name__)
+            )
         object.__setattr__(self, key, value)
 
     def save_to_prototxt(self, output):
        """
+
        Serialize current DistributedStrategy to string and save to output file
 
        Examples:
+            .. code-block:: python
 
-          .. code-block:: python
+                import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.dgc = True
+                strategy.recompute = True
+                strategy.recompute_configs = {"checkpoints": ["x"]}
+                strategy.save_to_prototxt("dist_strategy.prototxt")
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.dgc = True
-            strategy.recompute = True
-            strategy.recompute_configs = {"checkpoints": ["x"]}
-            strategy.save_to_prototxt("dist_strategy.prototxt")
        """
        with open(output, "w") as fout:
            fout.write(str(self.strategy))
 
     def load_from_prototxt(self, pb_file):
        """
+
        Load from prototxt file for DistributedStrategy initialization
 
        Examples:
+            .. code-block:: python
 
-          .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.load_from_prototxt("dist_strategy.prototxt") - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.load_from_prototxt("dist_strategy.prototxt") """ with open(pb_file, 'r') as f: self.strategy = google.protobuf.text_format.Merge( - str(f.read()), self.strategy) + str(f.read()), self.strategy + ) @property def execution_strategy(self): @@ -188,23 +193,26 @@ class DistributedStrategy(object): Configure ExecutionStrategy for DistributedStrategy Examples: + .. code-block:: python - .. code-block:: python + import paddle + exe_strategy = paddle.static.ExecutionStrategy() + exe_strategy.num_threads = 10 + exe_strategy.num_iteration_per_drop_scope = 10 + exe_strategy.num_iteration_per_run = 10 - import paddle - exe_strategy = paddle.static.ExecutionStrategy() - exe_strategy.num_threads = 10 - exe_strategy.num_iteration_per_drop_scope = 10 - exe_strategy.num_iteration_per_run = 10 + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.execution_strategy = exe_strategy - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.execution_strategy = exe_strategy """ execution_strategy = paddle.fluid.ExecutionStrategy() fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: - setattr(execution_strategy, f.name, - getattr(self.strategy.execution_strategy, f.name)) + setattr( + execution_strategy, + f.name, + getattr(self.strategy.execution_strategy, f.name), + ) return execution_strategy @execution_strategy.setter @@ -212,33 +220,37 @@ class DistributedStrategy(object): def execution_strategy(self, strategy): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: - setattr(self.strategy.execution_strategy, f.name, - getattr(strategy, f.name)) + setattr( + self.strategy.execution_strategy, + f.name, + getattr(strategy, f.name), + ) @property def build_strategy(self): """ + Configure BuildStrategy for DistributedStrategy Note that the properties of BuildStrategy are valid in DistributedStrategy only if the property is non-distributed strategy. Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_sequential_execution = True + build_strategy.fuse_elewise_add_act_ops = True + build_strategy.fuse_bn_act_ops = True + build_strategy.enable_auto_fusion = True + build_strategy.fuse_relu_depthwise_conv = True + build_strategy.fuse_broadcast_ops = True + build_strategy.fuse_all_optimizer_ops = True + build_strategy.enable_inplace = True - import paddle - build_strategy = paddle.static.BuildStrategy() - build_strategy.enable_sequential_execution = True - build_strategy.fuse_elewise_add_act_ops = True - build_strategy.fuse_bn_act_ops = True - build_strategy.enable_auto_fusion = True - build_strategy.fuse_relu_depthwise_conv = True - build_strategy.fuse_broadcast_ops = True - build_strategy.fuse_all_optimizer_ops = True - build_strategy.enable_inplace = True + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.build_strategy = build_strategy - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.build_strategy = build_strategy """ build_strategy = paddle.fluid.BuildStrategy() @@ -261,52 +273,60 @@ class DistributedStrategy(object): value = ReduceStrategyFleet(value) setattr(self.strategy.build_strategy, f.name, value) elif f.label == 3: # repeated field - getattr(self.strategy.build_strategy, - f.name).extend(getattr(strategy, f.name)) + getattr(self.strategy.build_strategy, f.name).extend( + getattr(strategy, f.name) + ) @property def gradient_scale_configs(self): """ + Set the strategy of gradient scale + Examples: + .. code-block:: python - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_scale_configs = {'scale_strategy': 'avg'} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_scale_configs = {'scale_strategy': 'avg'} Note that, strategy must be in 'avg', 'sum' or 'customized' + """ return get_msg_dict(self.strategy.gradient_scale_configs) @gradient_scale_configs.setter @is_strict_auto def gradient_scale_configs(self, config): - check_configs_key(self.strategy.gradient_scale_configs, config, - 'gradient_scale_configs') + check_configs_key( + self.strategy.gradient_scale_configs, + config, + 'gradient_scale_configs', + ) assign_configs_value(self.strategy.gradient_scale_configs, config) @property def a_sync(self): """ + Indicating whether we are using asynchronous stocastic gradient descent updates - for training. This property is valid when we are using parameter server training, + for training. This property is valid when we are using parameter server training, which is implied by setting approperate RoleMaker Default value: True Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.a_sync = True # by default this is True - strategy = fleet.DistributedStrategy() - strategy.a_sync = True # by default this is True + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.a_sync @@ -318,12 +338,15 @@ class DistributedStrategy(object): self.a_sync_configs = {"k_steps": 0} else: raise ValueError( - "The type of `flag` is invalid, expected type is bool, but received {}" - .format(type(flag))) + "The type of `flag` is invalid, expected type is bool, but received {}".format( + type(flag) + ) + ) @property def a_sync_configs(self): """ + Set a_sync update configurations. In general, asynchronous parameter server training has serveral configurable settings that can be configured through a dict. @@ -344,20 +367,19 @@ class DistributedStrategy(object): runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.a_sync = True # by default this is True + configs = {"k_steps": 1024, "send_queue_size": 32} + strategy.a_sync_configs = configs - strategy = fleet.DistributedStrategy() - strategy.a_sync = True # by default this is True - configs = {"k_steps": 1024, "send_queue_size": 32} - strategy.a_sync_configs = configs - - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return get_msg_dict(self.strategy.a_sync_configs) @@ -365,14 +387,16 @@ class DistributedStrategy(object): @a_sync_configs.setter @is_strict_auto def a_sync_configs(self, configs): - check_configs_key(self.strategy.a_sync_configs, configs, - "a_sync_configs") + check_configs_key( + self.strategy.a_sync_configs, configs, "a_sync_configs" + ) assign_configs_value(self.strategy.a_sync_configs, configs) @property def trainer_desc_configs(self): """ - Set trainer desc configurations. + + Set trainer desc configurations. **Notes**: dump_fields_path(str): the path of dump fields @@ -381,22 +405,21 @@ class DistributedStrategy(object): dump_param(list(str)): the param that you want to dump - stat_var_names(list(str)): + stat_var_names(list(str)): Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]} + strategy.trainer_desc_configs = configs - strategy = fleet.DistributedStrategy() - configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]} - strategy.trainer_desc_configs = configs - - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return get_msg_dict(self.strategy.trainer_desc_configs) @@ -404,22 +427,23 @@ class DistributedStrategy(object): @property def adam_d2sum(self): """ + set adam_d2sum Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.adam_d2sum = True # by default this is False - strategy = fleet.DistributedStrategy() - strategy.adam_d2sum = True # by default this is False + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.adam_d2sum @@ -430,43 +454,55 @@ class DistributedStrategy(object): self.strategy.adam_d2sum = flag else: raise ValueError( - "The type of `flag` is invalid, expected type is bool, but received {}" - .format(type(flag))) + "The type of `flag` is invalid, expected type is bool, but received {}".format( + type(flag) + ) + ) @trainer_desc_configs.setter @is_strict_auto def trainer_desc_configs(self, configs): - check_configs_key(self.strategy.trainer_desc_configs, configs, - "trainer_desc_configs") + check_configs_key( + self.strategy.trainer_desc_configs, configs, "trainer_desc_configs" + ) assign_configs_value(self.strategy.trainer_desc_configs, configs) @property def fs_client_param(self): """ - Set fs client configurations. - **Notes**: + + Set fs client configurations. + + Note: uri(str): the uri of fs client + user(str): the user_name of fs client + passwd(str): the passwd of fs client - hadoop_bin(str): + + hadoop_bin(str): + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) - strategy = fleet.DistributedStrategy() - configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"} - strategy.fs_client_param = configs - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"} + strategy.fs_client_param = configs + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.fs_client_param @fs_client_param.setter @is_strict_auto def fs_client_param(self, configs): - check_configs_key(self.strategy.fs_client_param, configs, - "fs_client_param") + check_configs_key( + self.strategy.fs_client_param, configs, "fs_client_param" + ) assign_configs_value(self.strategy.fs_client_param, configs) @property @@ -477,6 +513,7 @@ class DistributedStrategy(object): @is_strict_auto def sparse_table_configs(self, configs): from google.protobuf.descriptor import FieldDescriptor + table_param = self.strategy.downpour_table_param def set_table_config(msg, config_name, configs, index=0): @@ -493,8 +530,9 @@ class DistributedStrategy(object): data = getattr(msg, field.name).add() set_table_config(data, name, configs, i) else: - set_table_config(getattr(msg, field.name), name, - configs) + set_table_config( + getattr(msg, field.name), name, configs + ) else: # print("not message:", name) if name not in configs: @@ -513,133 +551,206 @@ class DistributedStrategy(object): for table_name in configs: table_data = table_param.add() table_data.table_name = table_name - set_table_config(table_data, "table_parameters." + table_name, - configs[table_name]) + set_table_config( + table_data, + "table_parameters." + table_name, + configs[table_name], + ) @sparse_table_configs.setter def fleet_desc_configs(self, configs): - support_sparse_key_list = ['sparse_table_class', 'sparse_compress_in_save', 'sparse_shard_num', \ - 'sparse_accessor_class', 'sparse_learning_rate', 'sparse_initial_g2sum', 'sparse_initial_range', \ - 'sparse_weight_bounds', 'sparse_fea_dim', 'sparse_embedx_dim', 'sparse_embedx_threshold', 'sparse_nonclk_coeff', \ - 'sparse_click_coeff', 'sparse_base_threshold', 'sparse_delta_threshold', 'sparse_delta_keep_days', \ - 'sparse_delete_after_unseen_days', 'sparse_show_click_decay_rate', 'sparse_delete_threshold', \ - 'sparse_converter', 'sparse_deconverter', 'sparse_enable_cache', 'sparse_cache_rate', \ - 'sparse_cache_file_num', 'sparse_beta1_decay_rate', 'sparse_beta2_decay_rate', \ - 'sparse_ada_epsilon', 'sparse_optimizer', 'sparse_ssd_unseenday_threshold', - 'embed_sparse_optimizer', 'embed_sparse_learning_rate', 'embed_sparse_weight_bounds', \ - 'embed_sparse_initial_range', 'embed_sparse_initial_g2sum', 'embed_sparse_beta1_decay_rate', \ - 'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \ - 'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \ - 'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot'] + support_sparse_key_list = [ + 'sparse_table_class', + 'sparse_compress_in_save', + 'sparse_shard_num', + 'sparse_accessor_class', + 'sparse_learning_rate', + 'sparse_initial_g2sum', + 'sparse_initial_range', + 'sparse_weight_bounds', + 'sparse_fea_dim', + 'sparse_embedx_dim', + 'sparse_embedx_threshold', + 'sparse_nonclk_coeff', + 'sparse_click_coeff', + 'sparse_base_threshold', + 'sparse_delta_threshold', + 'sparse_delta_keep_days', + 'sparse_delete_after_unseen_days', + 'sparse_show_click_decay_rate', + 'sparse_delete_threshold', + 
'sparse_converter', + 'sparse_deconverter', + 'sparse_enable_cache', + 'sparse_cache_rate', + 'sparse_cache_file_num', + 'sparse_beta1_decay_rate', + 'sparse_beta2_decay_rate', + 'sparse_ada_epsilon', + 'sparse_optimizer', + 'sparse_ssd_unseenday_threshold', + 'embed_sparse_optimizer', + 'embed_sparse_learning_rate', + 'embed_sparse_weight_bounds', + 'embed_sparse_initial_range', + 'embed_sparse_initial_g2sum', + 'embed_sparse_beta1_decay_rate', + 'embed_sparse_beta2_decay_rate', + 'embedx_sparse_optimizer', + 'embedx_sparse_learning_rate', + 'embedx_sparse_weight_bounds', + 'embedx_sparse_initial_range', + 'embedx_sparse_initial_g2sum', + 'embedx_sparse_beta1_decay_rate', + 'embedx_sparse_beta2_decay_rate', + 'feature_learning_rate', + 'nodeid_slot', + ] support_sparse_table_class = ['DownpourSparseTable'] support_sparse_accessor_class = [ - 'DownpourSparseValueAccessor', 'DownpourCtrAccessor', - 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', - 'DownpourDoubleUnitAccessor', 'DownpourCtrDymfAccessor' + 'DownpourSparseValueAccessor', + 'DownpourCtrAccessor', + 'DownpourCtrDoubleAccessor', + 'DownpourUnitAccessor', + 'DownpourDoubleUnitAccessor', + 'DownpourCtrDymfAccessor', ] from google.protobuf.descriptor import FieldDescriptor + table_param = self.strategy.downpour_table_param def add_graph_config(graph, strategy): - graph.feature_learning_rate = strategy.get('feature_learning_rate', - 0.05) + graph.feature_learning_rate = strategy.get( + 'feature_learning_rate', 0.05 + ) graph.nodeid_slot = strategy.get('nodeid_slot', 9008) def sparse_optimizer_config(sgd, strategy, prefix): - optimizer_name = strategy.get(prefix + "sparse_optimizer", - "adagrad") + optimizer_name = strategy.get( + prefix + "sparse_optimizer", "adagrad" + ) sgd.name = optimizer_name if optimizer_name == "naive": sgd.name = "SparseNaiveSGDRule" sgd.naive.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.05) + prefix + 'sparse_learning_rate', 0.05 + ) sgd.naive.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_initial_range', 1e-4 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.naive.weight_bounds.extend(bounds) elif optimizer_name == "adagrad": sgd.name = 'SparseAdaGradSGDRule' sgd.adagrad.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.05) + prefix + 'sparse_learning_rate', 0.05 + ) sgd.adagrad.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) if prefix == "embed_": sgd.adagrad.initial_range = 0 sgd.adagrad.initial_g2sum = strategy.get( - prefix + 'sparse_initial_g2sum', 3) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_initial_g2sum', 3 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adagrad.weight_bounds.extend(bounds) elif optimizer_name == "std_adagrad": sgd.name = 'StdAdaGradSGDRule' sgd.adagrad.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.05) + prefix + 'sparse_learning_rate', 0.05 + ) sgd.adagrad.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) if prefix == "embed_": sgd.adagrad.initial_range = 0 sgd.adagrad.initial_g2sum = strategy.get( - prefix + 'sparse_initial_g2sum', 3) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_initial_g2sum', 3 + ) + bounds = 
strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adagrad.weight_bounds.extend(bounds) elif optimizer_name == "adam": sgd.name = 'SparseAdamSGDRule' sgd.adam.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.001) + prefix + 'sparse_learning_rate', 0.001 + ) sgd.adam.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) sgd.adam.beta1_decay_rate = strategy.get( - prefix + 'sparse_beta1_decay_rate', 0.9) + prefix + 'sparse_beta1_decay_rate', 0.9 + ) sgd.adam.beta2_decay_rate = strategy.get( - prefix + 'sparse_beta2_decay_rate', 0.999) + prefix + 'sparse_beta2_decay_rate', 0.999 + ) sgd.adam.ada_epsilon = strategy.get( - prefix + 'sparse_ada_epsilon', 1e-8) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_ada_epsilon', 1e-8 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adam.weight_bounds.extend(bounds) elif optimizer_name == "shared_adam": sgd.name = 'SparseSharedAdamSGDRule' sgd.adam.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.001) + prefix + 'sparse_learning_rate', 0.001 + ) sgd.adam.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) sgd.adam.beta1_decay_rate = strategy.get( - prefix + 'sparse_beta1_decay_rate', 0.9) + prefix + 'sparse_beta1_decay_rate', 0.9 + ) sgd.adam.beta2_decay_rate = strategy.get( - prefix + 'sparse_beta2_decay_rate', 0.999) + prefix + 'sparse_beta2_decay_rate', 0.999 + ) sgd.adam.ada_epsilon = strategy.get( - prefix + 'sparse_ada_epsilon', 1e-8) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_ada_epsilon', 1e-8 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adam.weight_bounds.extend(bounds) def set_sparse_table_config(table_data, config): for key in config: if key not in support_sparse_key_list: raise ValueError("strategy key '%s' not support" % (key)) - table_class = config.get("sparse_table_class", - "DownpourSparseTable") + table_class = config.get( + "sparse_table_class", "DownpourSparseTable" + ) if table_class not in support_sparse_table_class: raise ValueError( "support sparse_table_class: ['DownpourSparseTable'], but actual %s" - % (table_class)) + % (table_class) + ) table_data.table_class = 'MemorySparseTable' table_data.shard_num = config.get('sparse_shard_num', 1000) table_data.enable_sparse_table_cache = config.get( - 'sparse_enable_cache', True) + 'sparse_enable_cache', True + ) table_data.sparse_table_cache_rate = config.get( - 'sparse_cache_rate', 0.00055) + 'sparse_cache_rate', 0.00055 + ) table_data.sparse_table_cache_file_num = config.get( - 'sparse_cache_file_num', 16) + 'sparse_cache_file_num', 16 + ) - accessor_class = config.get("sparse_accessor_class", - "DownpourCtrAccessor") + accessor_class = config.get( + "sparse_accessor_class", "DownpourCtrAccessor" + ) if accessor_class not in support_sparse_accessor_class: raise ValueError( "support sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s" - % (accessor_class)) + % (accessor_class) + ) if accessor_class.find("Double") >= 0: table_data.accessor.accessor_class = 'CtrDoubleAccessor' @@ -654,7 +765,8 @@ class DistributedStrategy(object): table_data.accessor.embedx_dim = config.get('sparse_embedx_dim', 8) table_data.accessor.fea_dim = 
table_data.accessor.embedx_dim + 3 table_data.accessor.embedx_threshold = config.get( - 'sparse_embedx_threshold', 10) + 'sparse_embedx_threshold', 10 + ) if accessor_class == 'DownpourUnitAccessor': table_data.accessor.ctr_accessor_param.show_scale = False @@ -662,23 +774,32 @@ class DistributedStrategy(object): table_data.accessor.ctr_accessor_param.show_scale = True table_data.accessor.ctr_accessor_param.nonclk_coeff = config.get( - 'sparse_nonclk_coeff', 0.1) + 'sparse_nonclk_coeff', 0.1 + ) table_data.accessor.ctr_accessor_param.click_coeff = config.get( - 'sparse_click_coeff', 1) + 'sparse_click_coeff', 1 + ) table_data.accessor.ctr_accessor_param.base_threshold = config.get( - 'sparse_base_threshold', 1.5) + 'sparse_base_threshold', 1.5 + ) table_data.accessor.ctr_accessor_param.delta_threshold = config.get( - 'sparse_delta_threshold', 0.25) + 'sparse_delta_threshold', 0.25 + ) table_data.accessor.ctr_accessor_param.delta_keep_days = config.get( - 'sparse_delta_keep_days', 16) - table_data.accessor.ctr_accessor_param.show_click_decay_rate = config.get( - 'sparse_show_click_decay_rate', 0.98) - table_data.accessor.ctr_accessor_param.delete_threshold = config.get( - 'sparse_delete_threshold', 0.8) - table_data.accessor.ctr_accessor_param.delete_after_unseen_days = config.get( - 'sparse_delete_after_unseen_days', 30) - table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = config.get( - 'sparse_ssd_unseenday_threshold', 1) + 'sparse_delta_keep_days', 16 + ) + table_data.accessor.ctr_accessor_param.show_click_decay_rate = ( + config.get('sparse_show_click_decay_rate', 0.98) + ) + table_data.accessor.ctr_accessor_param.delete_threshold = ( + config.get('sparse_delete_threshold', 0.8) + ) + table_data.accessor.ctr_accessor_param.delete_after_unseen_days = ( + config.get('sparse_delete_after_unseen_days', 30) + ) + table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = ( + config.get('sparse_ssd_unseenday_threshold', 1) + ) converter = config.get('sparse_converter', "") deconverter = config.get('sparse_deconverter', "") @@ -692,23 +813,33 @@ class DistributedStrategy(object): save_data2.converter = converter save_data2.deconverter = deconverter - if accessor_class == 'DownpourCtrAccessor' or accessor_class == 'DownpourCtrDoubleAccessor': - sparse_optimizer_config(table_data.accessor.embed_sgd_param, - config, '') - sparse_optimizer_config(table_data.accessor.embedx_sgd_param, - config, '') + if ( + accessor_class == 'DownpourCtrAccessor' + or accessor_class == 'DownpourCtrDoubleAccessor' + ): + sparse_optimizer_config( + table_data.accessor.embed_sgd_param, config, '' + ) + sparse_optimizer_config( + table_data.accessor.embedx_sgd_param, config, '' + ) else: - sparse_optimizer_config(table_data.accessor.embed_sgd_param, - config, 'embed_') - sparse_optimizer_config(table_data.accessor.embedx_sgd_param, - config, 'embedx_') + sparse_optimizer_config( + table_data.accessor.embed_sgd_param, config, 'embed_' + ) + sparse_optimizer_config( + table_data.accessor.embedx_sgd_param, config, 'embedx_' + ) add_graph_config(table_data.accessor.graph_sgd_param, config) if not configs: print("fleet desc config is empty") else: for table_name in configs: - if table_name == 'dense_table' or table_name == 'datanorm_table': + if ( + table_name == 'dense_table' + or table_name == 'datanorm_table' + ): continue if type(configs[table_name]) != dict: continue @@ -744,6 +875,7 @@ class DistributedStrategy(object): @property def amp_configs(self): """ + Set automatic mixed precision training 
configurations. In general, amp has serveral configurable settings that can be configured through a dict. @@ -772,28 +904,27 @@ class DistributedStrategy(object): Default True. Only takes effect when `use_pure_fp16` is turned on. Examples 1: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.amp = True - strategy.amp_configs = { - "init_loss_scaling": 32768, - "custom_white_list": ['conv2d']} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "custom_white_list": ['conv2d']} Examples 2: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.amp = True + # pure fp16 + strategy.amp_configs = { + "init_loss_scaling": 32768, + "use_pure_fp16": True + } - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.amp = True - # pure fp16 - strategy.amp_configs = { - "init_loss_scaling": 32768, - "use_pure_fp16": True - } """ return get_msg_dict(self.strategy.amp_configs) @@ -806,16 +937,16 @@ class DistributedStrategy(object): @property def asp(self): """ + Indicating whether we are using automatic sparsity training Default Value: False Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.asp = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.asp = True # by default this is false """ return self.strategy.asp @@ -835,30 +966,31 @@ class DistributedStrategy(object): Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.recompute = True + # suppose x and y are names of checkpoint tensors for recomputation + strategy.recompute_configs = {"checkpoints": ["x", "y"]} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.recompute = True - # suppose x and y are names of checkpoint tensors for recomputation - strategy.recompute_configs = {"checkpoints": ["x", "y"]} """ return self.strategy.recompute @property def sync_nccl_allreduce(self): """ + Indicating whether we are using synchronized all reduce in each communication thread We note that system overhead is usually lower when sync_nccl_allreduce = True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sync_nccl_allreduce = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sync_nccl_allreduce = True """ return self.strategy.sync_nccl_allreduce @@ -873,17 +1005,18 @@ class DistributedStrategy(object): @property def use_hierarchical_allreduce(self): """ + Indicating whether we are using hierarchical allreduce in collective communication Hierarchical allreduce often does allreduce within a certain node group and then do allreduce among the leaders of each group Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.use_hierarchical_allreduce = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.use_hierarchical_allreduce = True """ return self.strategy.use_hierarchical_allreduce @@ -900,16 +1033,17 @@ class DistributedStrategy(object): @property def hierarchical_allreduce_inter_nranks(self): """ + Number of ranks for low level node groups in hierarchical allreduce Default value: number of GPU cards on each single GPU machine Example: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.hierarchical_allreduce_inter_nranks = 8 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.hierarchical_allreduce_inter_nranks = 8 """ return self.strategy.hierarchical_allreduce_inter_nranks @@ -926,17 +1060,18 @@ class DistributedStrategy(object): @property def sync_batch_norm(self): """ + Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes. Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sync_batch_norm = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sync_batch_norm = True """ return self.strategy.sync_batch_norm @@ -952,16 +1087,17 @@ class DistributedStrategy(object): @property def fuse_all_reduce_ops(self): """ + Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training Default value: True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_all_reduce_ops = False - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_all_reduce_ops = False """ return self.strategy.fuse_all_reduce_ops @@ -976,17 +1112,18 @@ class DistributedStrategy(object): @property def fuse_grad_size_in_MB(self): """ + Specifying the size of gradient to fuse in Mega-Bytes Default value: 32 Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_MB = 50 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_grad_size_in_MB = 50 """ return self.strategy.fuse_grad_size_in_MB @@ -1001,18 +1138,20 @@ class DistributedStrategy(object): @property def last_comm_group_size_MB(self): """ - Specifying the size of gradient to fuse in Mega-Bytes when - the last group of each batch communicates. Making the last group - small is useful to improve performance. + + Specifying the size of gradient to fuse in Mega-Bytes when + the last group of each batch communicates. Making the last group + small is useful to improve performance. Default value: 1 Examples: - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.last_comm_group_size_MB = 2 + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.last_comm_group_size_MB = 2 + """ return self.strategy.last_comm_group_size_MB @@ -1027,18 +1166,19 @@ class DistributedStrategy(object): @property def find_unused_parameters(self): """ - Indicating whether we are using find_unused_parameters to + + Indicating whether we are using find_unused_parameters to find unused parameters in DataParallel. Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.find_unused_parameters = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.find_unused_parameters = True """ return self.strategy.find_unused_parameters @@ -1070,17 +1210,18 @@ class DistributedStrategy(object): @property def nccl_comm_num(self): """ + Specifying the number of NCCL communicator Default value: 1 Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.nccl_comm_num = 2 """ return self.strategy.nccl_comm_num @@ -1104,32 +1245,32 @@ class DistributedStrategy(object): @property def recompute_configs(self): """ - Set recompute configurations. - + + Set recompute configurations. + **Note**: checkpoints(list): list of string name of checkpoints. In general, the recompute strategy of current implementation should have some manually assign checkpoints. - enable_offload(bool): enable recompute checkpoints offload feature. this feature + enable_offload(bool): enable recompute checkpoints offload feature. this feature will offload the checkpoint to host memory to allow even larger batch size. since the memcpy from host to device takes time, it is a trade off between larger batch size and training speed. checkpoint_shape(list): list of int that specific the shape of checkpoint. so far recompute-offload requires that all checkpoint to be same shape, and every dimension - specific here should be determined ("-1" is not allowed). + specific here should be determined ("-1" is not allowed). Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.recompute = True - strategy.recompute_configs = { - "checkpoints": ["x", "y"], - "enable_offload": True, - "checkpoint_shape": [100, 512, 1024] } + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": ["x", "y"], + "enable_offload": True, + "checkpoint_shape": [100, 512, 1024] } """ return get_msg_dict(self.strategy.recompute_configs) @@ -1137,15 +1278,17 @@ class DistributedStrategy(object): @recompute_configs.setter @is_strict_auto def recompute_configs(self, configs): - check_configs_key(self.strategy.recompute_configs, configs, - "checkpoint_configs") + check_configs_key( + self.strategy.recompute_configs, configs, "checkpoint_configs" + ) assign_configs_value(self.strategy.recompute_configs, configs) @property def sharding(self): """ + Indicating whether we are using sharding Optimizer for memory - optimization. We implement the sharding optimizer following the ZeRO-DP + optimization. 
We implement the sharding optimizer following the ZeRO-DP idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054). Model parameters and Optimizer State are sharded into different ranks allowing to fit larger model. @@ -1154,12 +1297,12 @@ class DistributedStrategy(object): Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True - import paddle.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sharding = True """ return self.strategy.sharding @@ -1174,26 +1317,27 @@ class DistributedStrategy(object): @property def sharding_configs(self): """ - Set sharding configurations. + + Set sharding configurations. **Note**: - sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are - available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and + sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are + available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and communication. Default is segment_broadcast_MB. - segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and + segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and after every segment_broadcast_MB size parameter being broadcasted, the program will be cutted into one segment. This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology. Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 . - segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. + segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors. sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8. gradient_merge_acc_step(int, optional): specific the accumulation steps in gradient merge; and gradient merge will be turn off if gradient_merge_acc_step=1. Default is 1. - optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model. + optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model. the moment var will be prefetch from and offloaded to Host memory during update stage. it is a stragtegy that trades off between training speed and GPU memory, and is recommened to be turn on only when gradient_merge_acc_step large, where the number of time of update stage will be relatively small compared with forward&backward's. Default is False. 
@@ -1203,7 +1347,7 @@ class DistributedStrategy(object): pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. - pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. + pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it @@ -1211,42 +1355,43 @@ class DistributedStrategy(object): Examples: + .. code-block:: python + + # sharding-DP, 2 nodes with 8 gpus per node + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 32, + "sharding_degree": 8, + "dp_degree": 2, + "gradient_merge_acc_step": 4, + } - .. code-block:: python - - # sharding-DP, 2 nodes with 8 gpus per node - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sharding = True - strategy.sharding_configs = { - "sharding_segment_strategy": "segment_broadcast_MB", - "segment_broadcast_MB": 32, - "sharding_degree": 8, - "dp_degree": 2, - "gradient_merge_acc_step": 4, - } """ return get_msg_dict(self.strategy.sharding_configs) @sharding_configs.setter @is_strict_auto def sharding_configs(self, configs): - check_configs_key(self.strategy.sharding_configs, configs, - "sharding_configs") + check_configs_key( + self.strategy.sharding_configs, configs, "sharding_configs" + ) assign_configs_value(self.strategy.sharding_configs, configs) @property def without_graph_optimization(self): """ + Run program using Executor other than ParallelExecutor. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.without_graph_optimization = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True """ return self.strategy.without_graph_optimization @@ -1264,14 +1409,18 @@ class DistributedStrategy(object): @property def _calc_comm_same_stream(self): """ + This based on raw_program_optimizer program Set whether use same stream for calc and comm when fuse allreduce The default value for the calc_comm_same_stream is False + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.calc_comm_same_stream = True + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.calc_comm_same_stream = True + """ return self.strategy.calc_comm_same_stream @@ -1288,14 +1437,18 @@ class DistributedStrategy(object): @property def fuse_grad_merge(self): """ + Set whether fuse the grad for gradient merge. Note: this flag will only effect the gradient merge under pipeline mode The default value for the fuse_grad_merge is False + Examples: - .. 
code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_param_grad = True + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_param_grad = True + """ return self.strategy.fuse_grad_merge @@ -1310,12 +1463,17 @@ class DistributedStrategy(object): @property def fuse_grad_size_in_num(self): """ + This based on raw_program_optimizer program and allreduce the num of the fused op + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_grad_size_in_num = 2 + .. code-block:: python + + import paddle.distributed.fleet as fleet + + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_num = 2 + """ return self.strategy.fuse_grad_size_in_num @@ -1332,18 +1490,18 @@ class DistributedStrategy(object): @property def pipeline(self): """ + Indicating whether we are using pipeline parallelism for distributed training. Current implementation mainly focus on single GPU machine pipeline parallelism and data parallelism across GPU machine. The pipeline information is indicated through device_guard information in user-defined program. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.pipeline = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.pipeline = True """ return self.strategy.pipeline @@ -1383,13 +1541,14 @@ class DistributedStrategy(object): @property def pipeline_configs(self): """ + Set pipeline parallelism configurations. In pipeline parallelism, different parts of neural networks are running on different GPUS. - There are Tensor queue buffer between each pair of neighborhood GPUS + There are Tensor queue buffer between each pair of neighborhood GPUS that are responsible for synchronizing hidden Tensor results between GPUs. Pipeline parallelism consists of serveral producer-consumer style hardware pairs, such as GPU-GPU, CPU-GPU, GPU-XPU. The best way to speedup - pipeline parallelism is to make the size of Tensor in Tensor queue smaller, + pipeline parallelism is to make the size of Tensor in Tensor queue smaller, so that we will have a faster producer for downstream consumers. **Notes**: @@ -1398,13 +1557,12 @@ class DistributedStrategy(object): **micro_batch_size**: the number of small batches in each user defined batch Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.pipeline = True - strategy.pipeline_configs = {"micro_batch_size": 12} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.pipeline = True + strategy.pipeline_configs = {"micro_batch_size": 12} """ @@ -1413,22 +1571,23 @@ class DistributedStrategy(object): @pipeline_configs.setter @is_strict_auto def pipeline_configs(self, configs): - check_configs_key(self.strategy.pipeline_configs, configs, - "pipeline_configs") + check_configs_key( + self.strategy.pipeline_configs, configs, "pipeline_configs" + ) assign_configs_value(self.strategy.pipeline_configs, configs) @property def tensor_parallel(self): """ + Indicating whether we are using tensor parallel for distributed training. Examples: + .. code-block:: python - .. 
code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.tensor_parallel = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True """ return self.strategy.tensor_parallel @@ -1444,23 +1603,25 @@ class DistributedStrategy(object): @property def tensor_parallel_configs(self): """ + Set tensor_parallel configurations. **Notes**: **Detailed arguments for tensor_parallel_configs** + **tensor_parallel_degree**: degree of tensor parallel + **tensor_init_seed**: parameter initialization random seed Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.tensor_parallel = True - strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, - "tensor_init_seed": 123} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, + "tensor_init_seed": 123} """ return get_msg_dict(self.strategy.tensor_parallel_configs) @@ -1468,59 +1629,67 @@ class DistributedStrategy(object): @tensor_parallel_configs.setter @is_strict_auto def tensor_parallel_configs(self, configs): - check_configs_key(self.strategy.tensor_parallel_configs, configs, - "tensor_parallel_configs") + check_configs_key( + self.strategy.tensor_parallel_configs, + configs, + "tensor_parallel_configs", + ) assign_configs_value(self.strategy.tensor_parallel_configs, configs) @property def hybrid_configs(self): """ - Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism + + Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism needs to meet the following relationships total_number_GPUs = dp_degree * mp_degree * pp_degree **Note**: - dp_degree(int): set number of GPUs in a data parallel group. Default -1. + **dp_degree(int)**: set number of GPUs in a data parallel group. Default -1. This value should be an integer greater than 0. - If it is not set, or set to -1, its value will be inferred + If it is not set, or set to -1, its value will be inferred based on the total number of cards. - mp_degree(int): set number of GPUs in a model parallel group. Default 1 - pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1 + **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1 + + **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1 Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": 2, - "pp_degree": 1} + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 2, + "pp_degree": 1} + """ return get_msg_dict(self.strategy.hybrid_configs) @hybrid_configs.setter def hybrid_configs(self, configs): - check_configs_key(self.strategy.hybrid_configs, configs, - "hybrid_configs") + check_configs_key( + self.strategy.hybrid_configs, configs, "hybrid_configs" + ) assign_configs_value(self.strategy.hybrid_configs, configs) @property def localsgd(self): """ + Indicating whether we are using Local SGD training. Default Value: False For more details, please refer to `Don't Use Large Mini-Batches, Use Local SGD `_. Examples: + .. code-block:: python - .. 
code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.localsgd = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.localsgd = True # by default this is false """ return self.strategy.localsgd @@ -1536,6 +1705,7 @@ class DistributedStrategy(object): @property def localsgd_configs(self): """ + Set LocalSGD training configurations. LocalSGD has a configurable setting that can be configured through a dict. @@ -1544,14 +1714,14 @@ class DistributedStrategy(object): begin_step(int) The step of beginning training by localsgd. Default 1. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.localsgd = True + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4, - "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) @@ -1559,25 +1729,25 @@ class DistributedStrategy(object): @localsgd_configs.setter @is_strict_auto def localsgd_configs(self, configs): - check_configs_key(self.strategy.localsgd_configs, configs, - "localsgd_configs") + check_configs_key( + self.strategy.localsgd_configs, configs, "localsgd_configs" + ) assign_configs_value(self.strategy.localsgd_configs, configs) @property def adaptive_localsgd(self): """ + Indicating whether we are using Adaptive Local SGD training. Default Value: False - For more details, please refer to `Adaptive Communication Strategies to Achieve + For more details, please refer to `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. - Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.adaptive_localsgd = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True # by default this is false """ return self.strategy.adaptive_localsgd @@ -1593,6 +1763,7 @@ class DistributedStrategy(object): @property def adaptive_localsgd_configs(self): """ + Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable setting that can be configured through a dict. @@ -1600,17 +1771,18 @@ class DistributedStrategy(object): init_k_steps(int) The initial steps for training before adaptive localsgd. Then, the adaptive localsgd method will modify init_k_steps automatically. Default 1. + begin_step(int) The step of beginning training by adaptive localsgd. Default 1. Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = {"init_k_steps": 1, + "begin_step": 30} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.adaptive_localsgd = True - strategy.adaptive_localsgd_configs = {"init_k_steps": 1, - "begin_step": 30} """ return get_msg_dict(self.strategy.adaptive_localsgd_configs) @@ -1618,25 +1790,28 @@ class DistributedStrategy(object): @adaptive_localsgd_configs.setter @is_strict_auto def adaptive_localsgd_configs(self, configs): - check_configs_key(self.strategy.adaptive_localsgd_configs, configs, - "adaptive_localsgd_configs") + check_configs_key( + self.strategy.adaptive_localsgd_configs, + configs, + "adaptive_localsgd_configs", + ) assign_configs_value(self.strategy.adaptive_localsgd_configs, configs) @property def dgc(self): """ + Indicating whether we are using Deep Gradient Compression training. For more details, please refer to [Deep Gradient Compression](https://arxiv.org/abs/1712.01887). Default Value: False Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True # by default this is false """ return self.strategy.dgc @@ -1652,6 +1827,7 @@ class DistributedStrategy(object): @property def dgc_configs(self): r""" + Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable settings that can be configured through a dict. @@ -1668,13 +1844,13 @@ class DistributedStrategy(object): element will be transmitted. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True + strategy.dgc_configs = {"rampup_begin_step": 1252} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = {"rampup_begin_step": 1252} """ return get_msg_dict(self.strategy.dgc_configs) @@ -1687,16 +1863,17 @@ class DistributedStrategy(object): @property def fp16_allreduce(self): """ + Indicating whether we are using fp16 gradient allreduce training Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fp16_allreduce = True # by default this is false + strategy = fleet.DistributedStrategy() + strategy.fp16_allreduce = True # by default this is false """ return self.strategy.fp16_allreduce @@ -1711,6 +1888,7 @@ class DistributedStrategy(object): @property def gradient_merge(self): """ + Gradient Merge, also called as Gradient Accumulation, is a strategy for large batch training. With this strategy, model parameter will not be updated until user-defined steps. @@ -1721,13 +1899,13 @@ class DistributedStrategy(object): to model parameters. Examples: + .. code-block:: python - .. 
code-block:: python
 
+                import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.gradient_merge = True
+                strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.gradient_merge = True
-            strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
         """
         return self.strategy.gradient_merge
 
@@ -1742,6 +1920,7 @@ class DistributedStrategy(object):
     @property
     def gradient_merge_configs(self):
         """
+
         the key-value configs of distribute_strategy
 
         **Note**:
@@ -1750,39 +1929,41 @@ class DistributedStrategy(object):
             avg(bool): whether to average the gradients of each mini-batch, the default value is `True`
 
         Examples:
+            .. code-block:: python
 
-        .. code-block:: python
+                import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.gradient_merge = True
+                strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.gradient_merge = True
-            strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
         """
         return get_msg_dict(self.strategy.gradient_merge_configs)
 
     @gradient_merge_configs.setter
     @is_strict_auto
     def gradient_merge_configs(self, configs):
-        check_configs_key(self.strategy.gradient_merge_configs, configs,
-                          "gradient_configs")
+        check_configs_key(
+            self.strategy.gradient_merge_configs, configs, "gradient_configs"
+        )
         assign_configs_value(self.strategy.gradient_merge_configs, configs)
 
     @property
     def lars(self):
         """
-        Set lars configurations. lars is used to deal with the convergence problems when the global 
-        batch size is larger than 8k. For more details, please refer to 
+
+        Set lars configurations. lars is used to deal with the convergence problems when the global
+        batch size is larger than 8k. For more details, please refer to
         [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
 
         Default Value: False
 
         Examples:
+            .. code-block:: python
 
-        .. code-block:: python
+                import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.lars = True # by default this is false
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.lars = True # by default this is false
         """
         return self.strategy.lars
 
@@ -1797,29 +1978,30 @@ class DistributedStrategy(object):
     @property
     def lars_configs(self):
         """
+
         Set Lars training configurations.
 
         **Notes**:
        **lars_coeff (float)**: trust ratio in lars formula.
        **lars_weight_decay** (float): weight decay coefficient in lars formula.
-        **epsilon (float)**: argument is used to avoid potential devision-by-zero 
-        when compute the local lr; 
+        **epsilon (float)**: argument is used to avoid potential division-by-zero
+        when computing the local lr;
        **exclude_from_weight_decay ([string])**: is a list of name strings of layers which
        will be exclude from weight decay in lars formula.
 
        Examples:
+            .. code-block:: python
 
-        .. 
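The ``k_steps``/``avg`` options amount to plain gradient accumulation: gradients from ``k_steps`` mini-batches are summed (optionally averaged) and the parameters are updated once per window. A small pure-Python sketch of that behaviour, with made-up gradient values for illustration:

.. code-block:: python

    k_steps, avg, lr = 4, True, 0.1
    param, buffer, seen = 1.0, 0.0, 0

    def toy_grad(step):
        # stand-in for a real backward pass
        return 0.5 if step % 2 == 0 else 1.5

    for step in range(8):
        buffer += toy_grad(step)
        seen += 1
        if seen == k_steps:
            merged = buffer / k_steps if avg else buffer
            param -= lr * merged  # one update per k_steps mini-batches
            buffer, seen = 0.0, 0

    print(param)  # 0.8: only two updates were applied over eight mini-batches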
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.01, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ['batch_norm', '.b_0'] + } - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lars = True - strategy.lars_configs = { - "lars_coeff": 0.01, - "lars_weight_decay": 0.0005, - "epsilon": 0, - "exclude_from_weight_decay": ['batch_norm', '.b_0'] - } """ return get_msg_dict(self.strategy.lars_configs) @@ -1832,20 +2014,21 @@ class DistributedStrategy(object): @property def lamb(self): """ - Set lamb configurations. lamb is used to deal with the convergence problems for large - batch size training, specially for attention-related model like BERT. For more details, - please refer to + + Set lamb configurations. lamb is used to deal with the convergence problems for large + batch size training, specially for attention-related model like BERT. For more details, + please refer to [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962). Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lamb = True # by default this is false - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lamb = True # by default this is false """ return self.strategy.lamb @@ -1861,6 +2044,7 @@ class DistributedStrategy(object): @property def lamb_configs(self): """ + Set Lars training configurations. **Notes**: @@ -1869,16 +2053,16 @@ class DistributedStrategy(object): will be exclude from weight decay in lamb formula. Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lamb = True - strategy.lamb_configs = { - 'lamb_weight_decay': 0.01, - 'exclude_from_weight_decay': [], - } """ return get_msg_dict(self.strategy.lamb_configs) @@ -1891,8 +2075,10 @@ class DistributedStrategy(object): @property def elastic(self): """ + Indicating whether we want to do current distributed training on clusters with elastic resources. Currently, this is configuration is not valid. + """ return self.strategy.elastic @@ -1907,28 +2093,29 @@ class DistributedStrategy(object): @property def auto(self): """ + Indicating whether we are using auto-parallel configuration - This feature is currently an experimental feature. Currently, + This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other strategy configs except auto. For details, please reference the following code example Default Value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto = True + # if set other strategy at the same time, auto will not apply + # strategy.amp = True - strategy = fleet.DistributedStrategy() - strategy.auto = True - # if set other strategy at the same time, auto will not apply - # strategy.amp = True + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.auto @@ -1942,28 +2129,29 @@ class DistributedStrategy(object): @property def semi_auto(self): """ + Indicating whether we are using semi-auto parallel function - This feature is currently an experimental feature. Currently, + This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other strategy configs except semi-auto. For details, please reference the following code example Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.semi_auto = True + # if set other strategy at the same time, auto will not apply + # strategy.amp = True - strategy = fleet.DistributedStrategy() - strategy.semi_auto = True - # if set other strategy at the same time, auto will not apply - # strategy.amp = True + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.semi_auto @@ -1977,16 +2165,21 @@ class DistributedStrategy(object): @property def auto_search(self): """ + Indicating whether we are using auto-search parallel function For details, please reference the following code example Default Value: False + Examples: - .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.auto_search = True + .. code-block:: python + + import paddle + + paddle.enable_static() + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto_search = True + """ return self.strategy.auto_search @@ -2000,15 +2193,20 @@ class DistributedStrategy(object): @property def split_data(self): """ + Indicating whether we split the data. If True, we split the data. Default Value: True + Examples: - .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.split_data = True + .. 
code-block:: python
+
+                import paddle
+
+                paddle.enable_static()
+                import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.split_data = True
+
         """
         return self.strategy.split_data
 
@@ -2022,8 +2220,10 @@ class DistributedStrategy(object):
     @property
     def qat(self):
         """
+
         Indicating whether we are using quantization training
         Default Value: False
+
         """
         return self.strategy.qat
 
@@ -2037,6 +2237,7 @@ class DistributedStrategy(object):
     @property
     def qat_configs(self):
         """
+
         Set quantization training configurations. In general, qat has serveral configurable
         settings that can be configured through a dict.
 
@@ -2047,23 +2248,23 @@ class DistributedStrategy(object):
 
             activation_bits(int): quantization bit number for activation. Default is 8.
 
-            not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope, 
+            not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
                 the corresponding op will not be quantized.
 
             algo(str): Other quantization training algorithm.
 
         Exampless:
+            .. code-block:: python
 
-        .. code-block:: python
+                import paddle.distributed.fleet as fleet
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.qat = True
-            strategy.qat_configs = {
-                "channel_wise_abs_max": True,
-                "weight_bits": 8,
-                "activation_bits: 8,
-                "not_quant_pattern": ['skip_quant']}
+                strategy = fleet.DistributedStrategy()
+                strategy.qat = True
+                strategy.qat_configs = {
+                    "channel_wise_abs_max": True,
+                    "weight_bits": 8,
+                    "activation_bits": 8,
+                    "not_quant_pattern": ['skip_quant']}
 
         """
         return get_msg_dict(self.strategy.qat_configs)
 
@@ -2076,24 +2277,25 @@ class DistributedStrategy(object):
     @property
     def heter_ccl_mode(self):
         """
+
         Indicating whether we are using heter_ccl_mode for model training.
         This feature is currently an experimental feature. Currently,
         heter_ccl_mode can be used only for dataparallel with dygraph mode.
         Default Value: False
 
         Examples:
+            .. code-block:: python
 
-        .. code-block:: python
+                import paddle
+                import paddle.distributed.fleet as fleet
 
-            import paddle
-            import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.heter_ccl_mode = True
 
-            strategy = fleet.DistributedStrategy()
-            strategy.heter_ccl_mode = True
+                # to initialize the parallel env, one only needs to call
+                paddle.distributed.init_parallel_env()
+                # then the heterogeneous context will be created.
 
-            # for initialize parallel env, only need to call
-            paddle.distributed.init_parallel_env()
-            # then the heterogenous context will be created.
         """
         return self.strategy.heter_ccl_mode
 
@@ -2107,6 +2309,7 @@ class DistributedStrategy(object):
     @property
     def cudnn_exhaustive_search(self):
         """
+
         Indicating whether to use exhaustive search method to choose convolution algorithms.
         Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm.
         This method is time-consuming, the choosed algorithm will be cached for the given layer specifications.
@@ -2114,17 +2317,18 @@ class DistributedStrategy(object):
         Default Value: True
 
         Examples:
+            .. code-block:: python
 
-        .. 
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.cudnn_exhaustive_search = False + strategy = fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.cudnn_exhaustive_search @@ -2141,6 +2345,7 @@ class DistributedStrategy(object): @property def conv_workspace_size_limit(self): """ + The workspace limit size in MB unit for choosing cuDNN convolution algorithms. The inner funciton of cuDNN obtain the fastest suited algorithm that fits within this memory limit. Usually, large workspace size may lead to choose faster algorithms, @@ -2148,17 +2353,17 @@ class DistributedStrategy(object): Default Value: 4000 Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.conv_workspace_size_limit = 1024 + strategy = fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1024 - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.conv_workspace_size_limit @@ -2176,22 +2381,23 @@ class DistributedStrategy(object): @property def cudnn_batchnorm_spatial_persistent(self): """ + Indicates whether to use the mode CUDNN_BATCHNORM_SPATIAL_PERSISTENT function in batchnorm. This is only useful in cudnn. Default Value: True Examples: + .. code-block:: python - .. 
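Pulling several of the knobs above together, a minimal static-graph sketch of configuring a DistributedStrategy and wrapping an optimizer with it; the tiny fc network, the Momentum settings, and the particular strategies switched on here are illustrative assumptions only.

.. code-block:: python

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)

    # a tiny regression network in the default static program
    x = paddle.static.data(name="x", shape=[None, 32], dtype="float32")
    y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
    pred = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

    # switch on a couple of strategies and wrap the optimizer
    strategy = fleet.DistributedStrategy()
    strategy.gradient_merge = True
    strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
    strategy.lars = True

    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)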
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.cudnn_batchnorm_spatial_persistent = True + strategy = fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.cudnn_batchnorm_spatial_persistent @@ -2244,7 +2450,8 @@ class DistributedStrategy(object): h1_format = " " + "|{{:^{}s}}|\n".format(length) h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format( - max_k, " " * spacing, max_v) + max_k, " " * spacing, max_v + ) border = " +" + "".join(["="] * length) + "+" line = " +" + "".join(["-"] * length) + "+" @@ -2269,37 +2476,48 @@ class DistributedStrategy(object): if getattr(self.strategy, f.name): draws += border + "\n" draws += h1_format.format( - "{}=True <-> {}_configs".format(f.name, f.name)) + "{}=True <-> {}_configs".format(f.name, f.name) + ) draws += line + "\n" - my_configs = getattr(self.strategy, - f.name + "_configs") + my_configs = getattr( + self.strategy, f.name + "_configs" + ) config_fields = my_configs.DESCRIPTOR.fields for ff in config_fields: if isinstance( - getattr(my_configs, - ff.name), google.protobuf.pyext. - _message.RepeatedScalarContainer): + getattr(my_configs, ff.name), + google.protobuf.pyext._message.RepeatedScalarContainer, + ): values = getattr(my_configs, ff.name) for i, v in enumerate(values): if i == 0: draws += h2_format.format( - ff.name, str(v)) + ff.name, str(v) + ) else: draws += h2_format.format( - "", str(v)) + "", str(v) + ) else: draws += h2_format.format( ff.name, - str(getattr(my_configs, ff.name))) + str(getattr(my_configs, ff.name)), + ) else: env_draws += h2_format.format( - f.name, str(getattr(self.strategy, f.name))) + f.name, str(getattr(self.strategy, f.name)) + ) else: env_draws += h2_format.format( - f.name, str(getattr(self.strategy, f.name))) - - result_res = draws + border + "\n" + h1_format.format( - "Environment Flags, Communication Flags") + f.name, str(getattr(self.strategy, f.name)) + ) + + result_res = ( + draws + + border + + "\n" + + h1_format.format("Environment Flags, Communication Flags") + ) result_res += env_draws build_strategy_str = border + "\n" @@ -2309,7 +2527,8 @@ class DistributedStrategy(object): fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: build_strategy_str += h2_format.format( - f.name, str(getattr(self.strategy.build_strategy, f.name))) + f.name, str(getattr(self.strategy.build_strategy, f.name)) + ) build_strategy_str += border + "\n" execution_strategy_str = h1_format.format("Execution Strategy") @@ -2318,7 +2537,8 @@ class DistributedStrategy(object): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: execution_strategy_str += h2_format.format( - f.name, str(getattr(self.strategy.execution_strategy, f.name))) + f.name, str(getattr(self.strategy.execution_strategy, f.name)) + ) execution_strategy_str += border + "\n" result_res += build_strategy_str + execution_strategy_str diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index 305452e99f380dcbf14ff9db1108382c4c730881..7ad6ce3bd0033b5148884aa06a5ec9566bc7cc19 100644 --- 
a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -28,12 +28,13 @@ _HYBRID_PARALLEL_GROUP = None class ParallelMode(object): """ + There are all the parallel modes currently supported: - - DATA_PARALLEL: Distribute input data to different devices. - - TENSOR_PARALLEL: Shards tensors in the network to different devices. - - PIPELINE_PARALLEL: Place different layers of the network on different devices. - - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states - corresponding to the parameters to each device. + + - DATA_PARALLEL: Distribute input data to different devices. + - TENSOR_PARALLEL: Shards tensors in the network to different devices. + - PIPELINE_PARALLEL: Place different layers of the network on different devices. + - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device. Examples: .. code-block:: python @@ -43,6 +44,7 @@ class ParallelMode(object): print(parallel_mode.DATA_PARALLEL) # 0 """ + DATA_PARALLEL = 0 TENSOR_PARALLEL = 1 PIPELINE_PARALLEL = 2 @@ -50,14 +52,16 @@ class ParallelMode(object): class CommunicateTopology(object): - - def __init__(self, - hybrid_group_names=["data", "pipe", "sharding", "model"], - dims=[1, 1, 1, 1]): + def __init__( + self, + hybrid_group_names=["data", "pipe", "sharding", "model"], + dims=[1, 1, 1, 1], + ): self._parallel_names = hybrid_group_names self._dims = dims - self.coordinate = collections.namedtuple('Coordinate', - self._parallel_names) + self.coordinate = collections.namedtuple( + 'Coordinate', self._parallel_names + ) self._world_size = reduce(lambda x, y: x * y, self._dims) ranges = [range(d) for d in self._dims] @@ -65,7 +69,8 @@ class CommunicateTopology(object): self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate)))) self._rank2coord = dict( - zip(self._coord2rank.values(), self._coord2rank.keys())) + zip(self._coord2rank.values(), self._coord2rank.keys()) + ) def get_hybrid_group_names(self): return self._parallel_names @@ -90,7 +95,8 @@ class CommunicateTopology(object): def get_axis_list(self, axis_name, index): axis = self._parallel_names.index(axis_name) ranks = [ - self._coord2rank[coord] for coord in self._coord2rank.keys() + self._coord2rank[coord] + for coord in self._coord2rank.keys() if coord[axis] == index ] ranks.sort() @@ -132,7 +138,6 @@ class CommunicateTopology(object): class HybridCommunicateGroup(object): - def __init__(self, topology): self.nranks = paddle.distributed.get_world_size() self.global_rank = paddle.distributed.get_rank() @@ -148,10 +153,16 @@ class HybridCommunicateGroup(object): self._sharding_parallel_id = self._get_sharding_parallel_id() self.stage_id = self._get_pipe_parallel_id() - assert self._check_vaild_topo( - ), "Here is an unreasonable topogy setting. world_size: {}, but" \ - "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(self.nranks, - self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree) + assert self._check_vaild_topo(), ( + "Here is an unreasonable topogy setting. 
world_size: {}, but" + "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format( + self.nranks, + self._mp_degree, + self._sharding_degree, + self._pp_degree, + self._dp_degree, + ) + ) # create comm group for data parallel self._dp_group, self._dp_comm_group = self._set_comm_group("data") @@ -164,26 +175,43 @@ class HybridCommunicateGroup(object): # create comm group for sharding parallel self._sharding_group, self._sharding_comm_group = self._set_comm_group( - "sharding") + "sharding" + ) # create global group for check inf_nan / clip global norm self._check_group, self._check_comm_group = self._set_check_group( - "data") + "data" + ) # create p2p group - self.is_first_stage = (self.stage_id == 0) - self.is_last_stage = (self.stage_id == (self._pp_degree - 1)) + self.is_first_stage = self.stage_id == 0 + self.is_last_stage = self.stage_id == (self._pp_degree - 1) # create p2p_groups if self._pp_degree > 1: self._set_p2p_group() - debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \ - "sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree, - self._sharding_degree, self._pp_degree, self._dp_degree) - debug_str += ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" % ( - self._mp_group, self._sharding_group, self._pp_group, - self._dp_group, self._check_group) + debug_str = ( + "HybridParallelInfo: rank_id: %d, mp_degree: %d, " + "sharding_degree: %d, pp_degree: %d, dp_degree: %d" + % ( + self.global_rank, + self._mp_degree, + self._sharding_degree, + self._pp_degree, + self._dp_degree, + ) + ) + debug_str += ( + ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" + % ( + self._mp_group, + self._sharding_group, + self._pp_group, + self._dp_group, + self._check_group, + ) + ) logger.info(debug_str) global _HYBRID_PARALLEL_GROUP @@ -195,7 +223,12 @@ class HybridCommunicateGroup(object): # adding its parallel logic within that parallelism # when use sharding alone, it should have its own parallelism for its parallel logic # TODO modify 3 others parallel to support sharding - if self._mp_degree == 1 and self._pp_degree == 1 and self._dp_degree == 1 and self._sharding_degree > 1: + if ( + self._mp_degree == 1 + and self._pp_degree == 1 + and self._dp_degree == 1 + and self._sharding_degree > 1 + ): return ParallelMode.SHARDING_PARALLEL elif self._mp_degree == 1 and self._pp_degree == 1: return ParallelMode.DATA_PARALLEL @@ -206,7 +239,13 @@ class HybridCommunicateGroup(object): return ParallelMode.PIPELINE_PARALLEL def _check_vaild_topo(self): - return self._dp_degree * self._mp_degree * self._pp_degree * self._sharding_degree == self.nranks + return ( + self._dp_degree + * self._mp_degree + * self._pp_degree + * self._sharding_degree + == self.nranks + ) def _set_comm_group(self, parallel_method="data"): parallel_group = [] @@ -268,14 +307,16 @@ class HybridCommunicateGroup(object): self.prev_rank = prev_rank next_group = paddle.distributed.new_group( - ranks=[curr_rank, next_rank]) + ranks=[curr_rank, next_rank] + ) if self.global_rank == curr_rank: self.send_next_group = next_group elif self.global_rank == next_rank: self.recv_prev_group = next_group prev_group = paddle.distributed.new_group( - ranks=[prev_rank, curr_rank]) + ranks=[prev_rank, curr_rank] + ) if self.global_rank == curr_rank: self.send_prev_group = prev_group @@ -339,7 +380,12 @@ class HybridCommunicateGroup(object): return self._pp_comm_group def get_p2p_groups(self): - return self.send_next_group, 
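For intuition, a standalone sketch of the coordinate/rank bookkeeping that CommunicateTopology builds from ``namedtuple``, ``product`` and ``reduce``; the dp=2, pp=1, sharding=1, mp=2 layout is just an example.

.. code-block:: python

    import collections
    from functools import reduce
    from itertools import product

    names = ["data", "pipe", "sharding", "model"]
    dims = [2, 1, 1, 2]  # dp=2, pp=1, sharding=1, mp=2 -> world size 4

    Coordinate = collections.namedtuple("Coordinate", names)
    world_size = reduce(lambda a, b: a * b, dims)

    coords = [Coordinate(*c) for c in product(*[range(d) for d in dims])]
    coord2rank = {c: r for r, c in enumerate(coords)}
    rank2coord = {r: c for c, r in coord2rank.items()}

    def axis_list(axis_name, index):
        # ranks whose coordinate equals `index` along the given axis,
        # mirroring CommunicateTopology.get_axis_list
        axis = names.index(axis_name)
        return sorted(r for c, r in coord2rank.items() if c[axis] == index)

    print(world_size)             # 4
    print(rank2coord[3])          # Coordinate(data=1, pipe=0, sharding=0, model=1)
    print(axis_list("model", 0))  # [0, 2]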
self.send_prev_group, self.recv_next_group, self.recv_prev_group + return ( + self.send_next_group, + self.send_prev_group, + self.recv_next_group, + self.recv_prev_group, + ) # sharding parallel message: def _get_sharding_parallel_id(self): @@ -363,23 +409,25 @@ class HybridCommunicateGroup(object): return self._check_comm_group def get_rank_from_stage(self, stage_id, **kwargs): - return self._topo.get_rank_from_stage(self.global_rank, - pipe=stage_id, - **kwargs) + return self._topo.get_rank_from_stage( + self.global_rank, pipe=stage_id, **kwargs + ) class _CommunicateGroup(object): - """ tmp for static """ + """tmp for static""" def __init__(self): global _HYBRID_PARALLEL_GROUP _HYBRID_PARALLEL_GROUP = self self.groups = dict() - def set_comm_group(self, group_name, group_rank, group_size, ring_id, - group_ranks): - group = paddle.distributed.collective.Group(group_rank, ring_id, - group_ranks) + def set_comm_group( + self, group_name, group_rank, group_size, ring_id, group_ranks + ): + group = paddle.distributed.collective.Group( + group_rank, ring_id, group_ranks + ) self.groups[group_name] = group def get_group(self, group_name): diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 35bac463b981db6122297eabc4de6207c3ce5b17..8c7187236d47c36e2ba5aebc25febf7f4841b997 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -103,6 +103,7 @@ def _check_var_exists(var_name): def init_parallel_env(): """ + Initialize parallel training environment in dynamic graph mode. Note: @@ -118,6 +119,7 @@ def init_parallel_env(): Examples: .. code-block:: python + # required: gpu import paddle import paddle.nn as nn @@ -158,6 +160,7 @@ def init_parallel_env(): if __name__ == '__main__': dist.spawn(train) + """ # 0. get env & check world size diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 5cbc8f5e3beca8404113e93e4cc79e154b03b77a..bf3f2403ade33da405cf6b694c16c9d986a1838c 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -51,61 +51,76 @@ __all__ = [ def _check_normalization(norm): if norm not in ['forward', 'backward', 'ortho']: raise ValueError( - "Unexpected norm: {}. Norm should be forward, backward or ortho". - format(norm)) + "Unexpected norm: {}. Norm should be forward, backward or ortho".format( + norm + ) + ) def _check_fft_n(n): if not isinstance(n, int): raise ValueError( - "Invalid FFT argument n({}), it shoule be an integer.".format(n)) + "Invalid FFT argument n({}), it shoule be an integer.".format(n) + ) if n <= 0: raise ValueError( - "Invalid FFT argument n({}), it should be positive.".format(n)) + "Invalid FFT argument n({}), it should be positive.".format(n) + ) def _check_fft_shape(x, s): ndim = x.ndim if not isinstance(s, Sequence): raise ValueError( - "Invaid FFT argument s({}), it should be a sequence of integers.") + "Invaid FFT argument s({}), it should be a sequence of integers." + ) if len(s) > ndim: raise ValueError( "Length of FFT argument s should not be larger than the rank of input. 
" - "Received s: {}, rank of x: {}".format(s, ndim)) + "Received s: {}, rank of x: {}".format(s, ndim) + ) for size in s: if not isinstance(size, int) or size <= 0: - raise ValueError("FFT sizes {} contains invalid value ({})".format( - s, size)) + raise ValueError( + "FFT sizes {} contains invalid value ({})".format(s, size) + ) def _check_fft_axis(x, axis): ndim = x.ndim if not isinstance(axis, int): raise ValueError( - "Invalid FFT axis ({}), it shoule be an integer.".format(axis)) + "Invalid FFT axis ({}), it shoule be an integer.".format(axis) + ) if axis < -ndim or axis >= ndim: raise ValueError( "Invalid FFT axis ({}), it should be in range [-{}, {})".format( - axis, ndim, ndim)) + axis, ndim, ndim + ) + ) def _check_fft_axes(x, axes): ndim = x.ndim if not isinstance(axes, Sequence): raise ValueError( - "Invalid FFT axes ({}), it should be a sequence of integers.". - format(axes)) + "Invalid FFT axes ({}), it should be a sequence of integers.".format( + axes + ) + ) if len(axes) > ndim: raise ValueError( "Length of fft axes should not be larger than the rank of input. " - "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) + "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim) + ) for axis in axes: if not isinstance(axis, int) or axis < -ndim or axis >= ndim: raise ValueError( - "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})" - .format(axes, axis, ndim, ndim)) + "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".format( + axes, axis, ndim, ndim + ) + ) def _resize_fft_input(x, s, axes): @@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes): slices.append((0, s[i])) if axes_to_slice: - x = paddle.slice(x, - axes_to_slice, - starts=[item[0] for item in slices], - ends=[item[1] for item in slices]) + x = paddle.slice( + x, + axes_to_slice, + starts=[item[0] for item in slices], + ends=[item[1] for item in slices], + ) if axes_to_pad: padding_widths = [0] * (2 * ndim) for axis, pad in zip(axes_to_pad, paddings): @@ -146,8 +163,9 @@ def _normalize_axes(x, axes): def _check_at_least_ndim(x, rank): if x.ndim < rank: - raise ValueError("The rank of the input ({}) should >= {}".format( - x.ndim, rank)) + raise ValueError( + "The rank of the input ({}) should >= {}".format(x.ndim, rank) + ) # public APIs 1d @@ -155,30 +173,30 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): """ Calculate one-dimensional discrete Fourier transform. - This function uses the efficient fast Fourier transform (FFT) algorithm [1] to + This function uses the efficient fast Fourier transform (FFT) algorithm [1] to calculate the 1-D * n * point discrete Fourier transform (DFT). Args: x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. 
norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated + complex tensor. The truncated or zero-padded input, transformed along the axis indicated by `axis`, or the last one if `axis` is not specified. - + Examples: .. code-block:: python @@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): """ if is_integer(x) or is_floating_point(x): - return fft_r2c(x, - n, - axis, - norm, - forward=True, - onesided=False, - name=name) + return fft_r2c( + x, n, axis, norm, forward=True, onesided=False, name=name + ) else: return fft_c2c(x, n, axis, norm, forward=True, name=name) @@ -212,7 +226,7 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): """ Compute the 1-D inverse discrete Fourier Transform. - This function computes the inverse of the 1-D *n*-point discrete Fourier transform + This function computes the inverse of the 1-D *n*-point discrete Fourier transform computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. The input should be ordered in the same way as is returned by `fft`, @@ -225,27 +239,27 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): For an even number of input points, ``x[n//2]`` represents the sum of the values at the positive and negative Nyquist frequencies, as the two - are aliased together. + are aliased together. Args: x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. 
"forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated + complex tensor. The truncated or zero-padded input, transformed along the axis indicated by `axis`, or the last one if `axis` is not specified. Examples: @@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): """ if is_integer(x) or is_floating_point(x): - return fft_r2c(x, - n, - axis, - norm, - forward=False, - onesided=False, - name=name) + return fft_r2c( + x, n, axis, norm, forward=False, onesided=False, name=name + ) else: return fft_c2c(x, n, axis, norm, forward=False, name=name) @@ -286,40 +296,40 @@ def rfft(x, n=None, axis=-1, norm="backward", name=None): called the Fast Fourier Transform (FFT). When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore ``n//2 + 1``. Args: - x(Tensor) : Real-valued input tensor - n(int, optional): Number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis + x(Tensor) : Real-valued input tensor + n(int, optional): Number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis specified by `axis` is used. - axis(int, optional): Axis over which to compute the FFT. Default value + axis(int, optional): Axis over which to compute the FFT. Default value is last axis. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. 
Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor Examples: - + .. code-block:: python - + import paddle x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) @@ -334,38 +344,38 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): """ Computes the inverse of `rfft`. - This function calculates the inverse of the one-dimensional *n* point discrete - Fourier transform of the actual input calculated by "rfft". In other words, + This function calculates the inverse of the one-dimensional *n* point discrete + Fourier transform of the actual input calculated by "rfft". In other words, ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. - The input shall be in the form of "rfft", i.e. the actual zero frequency term, - followed by the complex positive frequency term, in the order of increasing frequency. - Because the discrete Fourier transform of the actual input is Hermite symmetric, - the negative frequency term is regarded as the complex conjugate term of the corresponding + The input shall be in the form of "rfft", i.e. the actual zero frequency term, + followed by the complex positive frequency term, in the order of increasing frequency. + Because the discrete Fourier transform of the actual input is Hermite symmetric, + the negative frequency term is regarded as the complex conjugate term of the corresponding positive frequency term. Args: x (Tensor): The input data. It's a Tensor type. It's a complex. n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified along the ` axis'. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. 
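A small example of the one-sided length rule for ``rfft`` and of why ``n`` usually has to be passed back to ``irfft`` when the original length is odd; the signal here is random and only the shapes matter.

.. code-block:: python

    import paddle

    x = paddle.rand([5])                 # odd-length real signal
    spec = paddle.fft.rfft(x)
    print(spec.shape)                    # [3] -> n//2 + 1 one-sided terms

    # without n, irfft assumes an even original length of 2*(k-1) = 4
    print(paddle.fft.irfft(spec).shape)  # [4]

    # passing n recovers the original odd length (up to floating point error)
    print(paddle.allclose(paddle.fft.irfft(spec, n=5), x))  # True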
The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in some cases. - + Examples: .. code-block:: python @@ -389,25 +399,25 @@ def hfft(x, n=None, axis=-1, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. It's a complex. n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified along the ` axis'. - axis (int,optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int,optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in some cases. - + Examples: .. code-block:: python @@ -428,40 +438,40 @@ def ihfft(x, n=None, axis=-1, norm="backward", name=None): """ The inverse FFT of a signal that has Hermitian symmetry. - This function computes the one dimensional *n*-point inverse FFT of a signal - that has Hermitian symmetry by means of an efficient algorithm called + This function computes the one dimensional *n*-point inverse FFT of a signal + that has Hermitian symmetry by means of an efficient algorithm called the Fast Fourier Transform (FFT). When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore + Hermitian-symmetric. 
This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore ``n//2 + 1``. Args: x(Tensor): Input tensor. - n(int, optional): The number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis + n(int, optional): The number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis specified by `axis` is used. axis(int, optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor. Examples: - + .. code-block:: python - - import paddle + + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) print(paddle.fft.ifft(spectrum)) @@ -480,7 +490,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): """ Compute the N-D discrete Fourier Transform. - This function calculates the n-D discrete Fourier transform on any number of axes + This function calculates the n-D discrete Fourier transform on any number of axes in the M-D array by fast Fourier transform (FFT). Args: @@ -493,20 +503,20 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. + axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name`. Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - + Examples: .. code-block:: python @@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] """ if is_integer(x) or is_floating_point(x): - return fftn_r2c(x, - s, - axes, - norm, - forward=True, - onesided=False, - name=name) + return fftn_r2c( + x, s, axes, norm, forward=True, onesided=False, name=name + ) else: return fftn_c2c(x, s, axes, norm, forward=True, name=name) @@ -573,20 +579,20 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. + axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - + Examples: .. code-block:: python @@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): # (-0.1666666716337204+0.28867512941360474j)]]) """ if is_integer(x) or is_floating_point(x): - return fftn_r2c(x, - s, - axes, - norm, - forward=False, - onesided=False, - name=name) + return fftn_r2c( + x, s, axes, norm, forward=False, onesided=False, name=name + ) else: return fftn_c2c(x, s, axes, norm, forward=False, name=name) def rfftn(x, s=None, axes=None, norm="backward", name=None): """ + The N dimensional FFT for real input. This function computes the N-dimensional discrete Fourier Transform over @@ -637,64 +640,63 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): Args: x(Tensor) : Input tensor, taken to be real. - s(Sequence[int], optional) : Shape to use from the exec fft. The final element of - `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining - axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if - the given shape is smaller than that of the input, the input is - cropped. If it is larger, the input is padded with zeros. 
if `s` is - not given, the shape of the input along the axes specified by `axes` + s(Sequence[int], optional) : Shape to use from the exec fft. The final element of + `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining + axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if + the given shape is smaller than that of the input, the input is + cropped. If it is larger, the input is padded with zeros. if `s` is + not given, the shape of the input along the axes specified by `axes` is used. - axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, - the last ``len(s)`` axes are used, or all axes if `s` is also not + axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, + the last ``len(s)`` axes are used, or all axes if `s` is also not specified. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". The details of + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". The details of three operations are shown below: - - - "backward": The factor of forward direction and backward direction are ``1`` - and ``1/n`` respectively; - - "forward": The factor of forward direction and backward direction are ``1/n`` - and ``1`` respectively; + + - "backward": The factor of forward direction and backward direction are ``1`` + and ``1/n`` respectively; + - "forward": The factor of forward direction and backward direction are ``1/n`` + and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: - out(Tensor): complex tensor + out(Tensor), complex tensor Examples: - - .. code-block:: python - - import paddle + .. 
code-block:: python + + import paddle - # default, all axis will be used to exec fft - x = paddle.ones((2, 3, 4)) - print(paddle.fft.rfftn(x)) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - # use axes(2, 0) - print(paddle.fft.rfftn(x, axes=(2, 0))) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) + # default, all axis will be used to exec fft + x = paddle.ones((2, 3, 4)) + print(paddle.fft.rfftn(x)) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(24+0j), 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + # use axes(2, 0) + print(paddle.fft.rfftn(x, axes=(2, 0))) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) """ return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) @@ -717,37 +719,37 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - - - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. - - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. - - If `s` is not given, the shape of the input along the axes specified by axes is used. Except for the last axis which is taken to be ``2*(k-1)`` - + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + + - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. + - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. + - If `s` is not given, the shape of the input along the axes specified by axes is used. Except for the last axis which is taken to be ``2*(k-1)`` + where ``k`` is the length of the input along that axis. - + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. + `len(s)` axes are used, or all axes if `s` is also not specified. norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". The details of + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". 
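The same one-sided rule holds in the n-dimensional case: only the last transformed axis is halved, and passing ``s`` to ``irfftn`` recovers the original shape. A brief sketch, with an arbitrary random input:

.. code-block:: python

    import paddle

    x = paddle.rand([2, 3, 4])
    spec = paddle.fft.rfftn(x)
    print(spec.shape)                  # [2, 3, 3]: the last axis becomes 4 // 2 + 1

    out = paddle.fft.irfftn(spec, s=x.shape)
    print(paddle.allclose(out, x))     # True (up to floating point error)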
The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or by a combination of `s` or `x`, as explained in the parameters section above. The length of + Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or by a combination of `s` or `x`, as explained in the parameters section above. The length of each transformed axis is as given by the corresponding element of `s`, or the length of the input in every axis except for the last one if `s` is not given. In the final transformed axis the length - of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final - transformed axis of the input. To get an odd number of output points in the final axis, + of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final + transformed axis of the input. To get an odd number of output points in the final axis, `s` must be specified. Examples: @@ -760,12 +762,12 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): print(x) irfftn_x = paddle.fft.irfftn(x) print(irfftn_x) - + # Tensor(shape=[3], dtype=complex128, place=Place(cpu), stop_gradient=True, # [(2+2j), (2+2j), (3+3j)]) # Tensor(shape=[4], dtype=float64, place=Place(cpu), stop_gradient=True, # [ 2.25000000, -1.25000000, 0.25000000, 0.75000000]) - + """ return fftn_c2r(x, s, axes, norm, forward=False, name=name) @@ -775,35 +777,35 @@ def hfftn(x, s=None, axes=None, norm="backward", name=None): Compute the N-D FFT of Hermitian symmetric complex input, i.e., a signal with a real spectrum. - This function calculates the n-D discrete Fourier transform of Hermite symmetric - complex input on any axis in M-D array by fast Fourier transform (FFT). - In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. - (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary + This function calculates the n-D discrete Fourier transform of Hermite symmetric + complex input on any axis in M-D array by fast Fourier transform (FFT). + In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. + (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary for the same reason that ``irfft` requires ``x.shape``.) Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. + s (sequence of ints, optional): The length of the output transform axis. (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used. 
Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where + where ``s[-1]//2+1`` points of the input are used. Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where ``k`` is the length of the input along that axis. axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. + `len(s)` axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or + Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or a combination of `s` or `X`. - + Examples: .. code-block:: python @@ -823,36 +825,36 @@ def ihfftn(x, s=None, axes=None, norm="backward", name=None): """ The n dimensional inverse FFT of a signal that has Hermitian symmetry. - This function computes the n dimensional inverse FFT over any number of axes - in an M-dimensional of a signal that has Hermitian symmetry by means of an + This function computes the n dimensional inverse FFT over any number of axes + in an M-dimensional of a signal that has Hermitian symmetry by means of an efficient algorithm called the Fast Fourier Transform (FFT). Args: x(Tensor): Input tensor. - s(Sequence[int], optional) : Shape (length along each transformed axis) - to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis - 1, etc.). Along any axis, if the given shape is smaller than that - of the input, the input is cropped. If it is larger, the input is - padded with zeros. if `s` is not given, the shape of the input + s(Sequence[int], optional) : Shape (length along each transformed axis) + to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis + 1, etc.). Along any axis, if the given shape is smaller than that + of the input, the input is cropped. If it is larger, the input is + padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. 
Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor. Examples: - + .. code-block:: python - - import paddle + + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) print(paddle.fft.ifft(spectrum)) @@ -877,22 +879,22 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. Along each axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. Examples: @@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." 
- .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return fftn(x, s, axes, norm, name) @@ -943,22 +949,22 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. Along each axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. Examples: @@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return ifftn(x, s, axes, norm, name) @@ -1000,28 +1010,28 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): x(Tensor): Input tensor, taken to be real. s(Sequence[int], optional) : Shape of the FFT. axes(Sequence[int], optional): Axes over which to compute the FFT. - norm(str, optional) : {"backward", "ortho", "forward"}, - default is "backward". Indicates which direction of the - forward/backward pair of transforms is scaled and with what - normalization factor. The details of + norm(str, optional) : {"backward", "ortho", "forward"}, + default is "backward". Indicates which direction of the + forward/backward pair of transforms is scaled and with what + normalization factor. 
The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . - Returns: + Returns: out(Tensor): The result of the real 2-D FFT. Examples: .. code-block:: python - + import paddle import numpy as np @@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return rfftn(x, s, axes, norm, name) @@ -1055,24 +1069,24 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. - axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes - must be two-dimensional. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes + must be two-dimensional. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". The details of + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + Returns: Real tensor. The result of the inverse real 2-D FFT. - + Examples: .. 
code-block:: python @@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return irfftn(x, s, axes, norm, name) @@ -1107,17 +1125,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. s (sequence of ints, optional): Shape of the real output. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be - two-dimensional. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be + two-dimensional. If not specified, the last two axes are used by default. norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: Real tensor. The real result of the 2-D Hermitian complex real FFT. - + Examples: .. code-block:: python @@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return hfftn(x, s, axes, norm, name) @@ -1155,13 +1177,13 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x(Tensor): Input tensor. s(Sequence[int], optional): Shape of the real input to the inverse FFT. - axes(Sequance[int], optional): The axes over which to compute the + axes(Sequance[int], optional): The axes over which to compute the inverse fft. Default is the last two axes. - norm(str, optional): {"backward", "ortho", "forward"}. Default is + norm(str, optional): {"backward", "ortho", "forward"}. Default is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . 
Returns: out(Tensor) : The result of the inverse hermitian 2-D FFT. @@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return ihfftn(x, s, axes, norm, name) @@ -1214,7 +1240,7 @@ def fftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1250,8 +1276,8 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): """ Return the Discrete Fourier Transform sample frequencies. - The returned floating-point array "F" contains the center of the frequency unit, - and the unit is the number of cycles of the sampling interval (the starting point is zero). + The returned floating-point array "F" contains the center of the frequency unit, + and the unit is the number of cycles of the sampling interval (the starting point is zero). Given input length `n` and a sample spacing `d`:: @@ -1263,9 +1289,9 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. - dtype (str, optional): The data type of returns. Defaults is the data type of returns + dtype (str, optional): The data type of returns. Defaults is the data type of returns of ``paddle.get_default_dtype()``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1307,12 +1333,12 @@ def fftshift(x, axes=None, name=None): n (int): Dimension inputed. axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. Default is None. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor. The shifted tensor. - + Examples: .. code-block:: python @@ -1343,19 +1369,19 @@ def fftshift(x, axes=None, name=None): def ifftshift(x, axes=None, name=None): """ - The inverse of `fftshift`. Although the even length 'x' is the same, the function of the + The inverse of `fftshift`. Although the even length 'x' is the same, the function of the odd length 'x' is different. An example. Args: n (int): Dimension inputed. axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. Default is None. - name (str, optional): The default value is None. 
Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor. The shifted tensor. - + Examples: .. code-block:: python @@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name): dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): if in_dygraph_mode(): out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) elif _in_legacy_dygraph(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'onesided', + onesided, + ) out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { @@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) + _real_to_complex_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name): out = _C_ops.fft_c2r(x, axes, norm, forward, 0) elif _in_legacy_dygraph(): if n is not None: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', n) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'last_dim_size', + n, + ) else: attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_legacy_C_ops, op_type)(x, *attrs) @@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) + _complex_to_real_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name): if s is not None: if len(s) != len(axes): raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". - format(len(s), len(axes))) + "Length of s ({}) and length of axes ({}) does not match.".format( + len(s), len(axes) + ) + ) s = [s[i] for i in axes_argsoft] if s is not None: @@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): if s is not None: if len(s) != len(axes): raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". 
- format(len(s), len(axes))) + "Length of s ({}) and length of axes ({}) does not match.".format( + len(s), len(axes) + ) + ) s = [s[i] for i in axes_argsoft] + [s[-1]] if s is not None: @@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): if in_dygraph_mode(): out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) elif _in_legacy_dygraph(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'onesided', + onesided, + ) out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { @@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) + _real_to_complex_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): if s is not None: if len(s) != len(axes): raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". - format(len(s), len(axes))) + "Length of s ({}) and length of axes ({}) does not match.".format( + len(s), len(axes) + ) + ) s = [s[i] for i in axes_argsoft] + [s[-1]] if s is not None: @@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name): out = _C_ops.fft_c2r(x, axes, norm, forward, 0) elif _in_legacy_dygraph(): if s: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', s[-1]) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'last_dim_size', + s[-1], + ) else: attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_legacy_C_ops, op_type)(x, *attrs) @@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) + _complex_to_real_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py index 8cd422d0d7691d59d74fba462465e950b7ce6454..b0b64f27eccc1ec5849f43ea1a57d584354e1527 100644 --- a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -23,9 +23,9 @@ from ...log_helper import get_logger __all__ = ['add_supported_layer'] -_logger = get_logger(__name__, - logging.INFO, - fmt='%(asctime)s-%(levelname)s: %(message)s') +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) def _default_pruning(weight_nparray, m, n, func_name, param_name): @@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): exlude_cond_shape4 = len(shape) == 4 and shape[1] < m if exlude_cond_shape2: _logger.warning( - '{} is not pruned because the first dimension of {} is smaller than {}' - .format(param_name, shape, m)) + '{} is not pruned because the first dimension of {} is smaller than 
{}'.format( + param_name, shape, m + ) + ) return weight_pruned_nparray, weight_sparse_mask if exlude_cond_shape4: _logger.warning( - '{} is not pruned because the second dimension of {} is smaller than {}' - .format(param_name, shape, m)) + '{} is not pruned because the second dimension of {} is smaller than {}'.format( + param_name, shape, m + ) + ) return weight_pruned_nparray, weight_sparse_mask checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) @@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): # sparsity/utils is row-major pruning. That is the reason we have to transpose weight # matrices beforce invoking create_mask. Then we transpose the result mask to make # sure its shape to be the same as the input weight. - weight_sparse_mask = sparsity.create_mask(weight_nparray.T, - func_name=func_name, - n=n, - m=m).T + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m + ).T weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) - assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ - 'Pruning {} weight matrix failure!!!'.format(param_name) + assert sparsity.check_sparsity( + weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name + ), 'Pruning {} weight matrix failure!!!'.format(param_name) return weight_pruned_nparray, weight_sparse_mask @@ -78,28 +82,35 @@ supported_layers_and_prune_func_map = {} def add_supported_layer(layer, pruning_func=None): r""" + Add supported layers and its corresponding pruning function. Args: - name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then - it would be turn to string internally. ASP would use this name to match parameter's name and call - its the corresponding pruning function. + name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then + it would be turn to string internally. ASP would use this name to match parameter's name and call + its the corresponding pruning function. pruning_func (function, optional): a function type which receives five argument (weight_nparray, - m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, - m, n, and func_name, please see `prune_model` for details. + m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, + m, n, and func_name, please see `prune_model` for details. 
+ """ name = None if isinstance(layer, str): name = layer elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): name = paddle.fluid.dygraph.layers._convert_camel_to_snake( - type(layer).__name__) + type(layer).__name__ + ) elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): name = paddle.fluid.dygraph.layers._convert_camel_to_snake( - layer.__name__) + layer.__name__ + ) else: - assert "The type of layer should be string of Layer, but got {}!".format( - type(layer)) + assert ( + "The type of layer should be string of Layer, but got {}!".format( + type(layer) + ) + ) if pruning_func is None: pruning_func = _default_pruning _supported_layers_and_prune_func_map_lock.acquire() diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index 1d0694c4dde3cb68e201684e8d447afaa0e4f2d4..c6d706bd31e8effcad5e5c5a266577ed34b24f9e 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -27,9 +27,16 @@ from itertools import permutations import threading __all__ = [ - 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', - 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', - 'MaskAlgo', 'CheckMethod' + 'calculate_density', + 'check_mask_1d', + 'get_mask_1d', + 'check_mask_2d', + 'get_mask_2d_greedy', + 'get_mask_2d_best', + 'create_mask', + 'check_sparsity', + 'MaskAlgo', + 'CheckMethod', ] @@ -76,8 +83,9 @@ class CheckMethod(Enum): CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) # CheckMethod.CHECK_2D """ - assert isinstance(mask_algo, MaskAlgo), \ - "mask_algo should be MaskAlgo type" + assert isinstance( + mask_algo, MaskAlgo + ), "mask_algo should be MaskAlgo type" if mask_algo == MaskAlgo.MASK_1D: return CheckMethod.CHECK_1D else: @@ -86,20 +94,25 @@ class CheckMethod(Enum): def calculate_density(x): r""" + Return the density of the input tensor. Args: x (nparray): The input tensor. + Returns: - float: The density of :attr:`x`. + float, The density of :attr:`x`. + Examples: .. code-block:: python - import paddle - import numpy as np - x = np.array([[0, 1, 3, 0], + import paddle + import numpy as np + + x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - paddle.incubate.asp.calculate_density(x) # 0.625 + paddle.incubate.asp.calculate_density(x) # 0.625 + """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size @@ -108,7 +121,7 @@ def calculate_density(x): def _reshape_1d(mat, m): r""" Reshape the input 2D matrix to shape (-1, m). - If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, + If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. .. math:: @@ -126,7 +139,7 @@ def _reshape_1d(mat, m): remainder = mat.shape[1] % m if mat.shape[1] % m > 0: mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) - mat_padded[:, :mat.shape[1]] = mat + mat_padded[:, : mat.shape[1]] = mat shape = mat_padded.shape return mat_padded.reshape(-1, m), shape else: @@ -136,7 +149,7 @@ def _reshape_1d(mat, m): def check_mask_1d(mat, n, m): r""" Check if every row of the input matrix :attr:`mat` is in 1D `n:m` sparse pattern. - This function would pad the second dimension of :attr:`mat` by zero + This function would pad the second dimension of :attr:`mat` by zero to be a multiples of :attr:`m` if necessary. 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. 
@@ -179,8 +192,8 @@ def check_mask_1d(mat, n, m): def get_mask_1d(mat, n, m): r""" - Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` - in row-directory. This function would pad the second dimension of :attr:`mat` + Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` + in row-directory. This function would pad the second dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. @@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m): min_order_indices = np.argsort(np.absolute(sub_mat)) mask_flattern[i, min_order_indices[:n].tolist()] = 0 mask_flattern = mask_flattern.reshape(shape) - mask[:, :] = mask_flattern[:, :mat.shape[1]] + mask[:, :] = mask_flattern[:, : mat.shape[1]] return mask @@ -239,12 +252,12 @@ def _reshape_2d(mat, m): remainder_0 = mat.shape[0] % m remainder_1 = mat.shape[1] % m - new_shape = (mat.shape[0] if remainder_0 == 0 \ - else mat.shape[0] + (m - remainder_0), - mat.shape[1] if remainder_1 == 0 \ - else mat.shape[1] + (m - remainder_1)) + new_shape = ( + mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0), + mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1), + ) mat_padded = np.zeros(new_shape) - mat_padded[:mat.shape[0], :mat.shape[1]] = mat + mat_padded[: mat.shape[0], : mat.shape[1]] = mat mat_flattern = np.empty(new_shape).reshape(-1, m * m) curr_idx = 0 @@ -252,9 +265,9 @@ def _reshape_2d(mat, m): row_end = row_start + m for col_start in range(0, mat_padded.shape[1], m): col_end = col_start + m - sub_mat = np.squeeze(mat_padded[row_start:row_end, \ - col_start:col_end] \ - .reshape(-1)) + sub_mat = np.squeeze( + mat_padded[row_start:row_end, col_start:col_end].reshape(-1) + ) mat_flattern[curr_idx] = sub_mat curr_idx += 1 return mat_flattern, mat_padded.shape @@ -263,10 +276,10 @@ def _reshape_2d(mat, m): def check_mask_2d(mat, n, m): r""" Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in 2D `n:m` sparse pattern. - This function would pad each dimension of :attr:`mat` by zero to be a multiples of + This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` if necessary. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Args: @@ -304,18 +317,19 @@ def check_mask_2d(mat, n, m): mat_padded, shape = _reshape_2d(mat, m) for sub_mat in mat_padded: sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 - if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ - (np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0): + if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and ( + np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0 + ): return False return True def get_mask_2d_greedy(mat, n, m): r""" - Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. + Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. 
- 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Greedily generating: For each :math:`m \times m` block, selecting values to keep in descent order. @@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m): sub_mask = np.squeeze(mask_padded[idx]) min_order_1d_indices = np.argsort(sub_mat) - min_order_2d_indices = [(int(x / m), x % m) - for x in min_order_1d_indices] + min_order_2d_indices = [ + (int(x / m), x % m) for x in min_order_1d_indices + ] row_counter = collections.Counter() col_counter = collections.Counter() for i in range(len(min_order_1d_indices) - 1, -1, -1): matrix_entry = min_order_2d_indices[i] - if (row_counter[matrix_entry[0]] == n) or \ - (col_counter[matrix_entry[1]] == n): + if (row_counter[matrix_entry[0]] == n) or ( + col_counter[matrix_entry[1]] == n + ): continue sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0 @@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m): col_end = col_start + m mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx] curr_idx += 1 - return mask[:mat.shape[0], :mat.shape[1]] + return mask[: mat.shape[0], : mat.shape[1]] _valid_2d_patterns_lock = threading.Lock() @@ -384,7 +400,7 @@ def _compute_valid_2d_patterns(n, m): r""" Compute all vaild 2D `n:m` sparse patterns. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Args: @@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m): patterns = patterns + patterns patterns = np.asarray(list(set(permutations(patterns, m)))) - valid = ((patterns.sum(axis=1) <= n).sum( - axis=1) == m).nonzero()[0].reshape(-1) + valid = ( + ((patterns.sum(axis=1) <= n).sum(axis=1) == m) + .nonzero()[0] + .reshape(-1) + ) valid_patterns = np.empty((valid.shape[0], m, m)) valid_patterns[:] = patterns[valid[:]] @@ -420,11 +439,11 @@ def _compute_valid_2d_patterns(n, m): def get_mask_2d_best(mat, n, m): r""" - Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` - to form sparse matrix with maximun L1 norm .This function would pad each + Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` + to form sparse matrix with maximun L1 norm .This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. *Note*: L1 norm of sparse matrix from `Best` API is greater than or equal to the one from `Greedy`. 
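For orientation, the 2D `n:m` constraint stated in the docstrings above (at least `n` zeros in every row and every column of each `m x m` block) can be checked on a single block with a few lines of NumPy. This is a simplified reading of the documented constraint rather than the in-tree `check_mask_2d` implementation, and `block_satisfies_2d` is an illustrative name:

.. code-block:: python

    import numpy as np

    def block_satisfies_2d(block, n=2, m=4):
        # Every row and every column of the m x m block may keep at most
        # m - n non-zero entries, i.e. it must contain at least n zeros.
        nz = np.abs(np.asarray(block)) > 0
        return bool((nz.sum(axis=1) <= m - n).all()
                    and (nz.sum(axis=0) <= m - n).all())

    print(block_satisfies_2d(np.eye(4)))        # True: one non-zero per row/column
    print(block_satisfies_2d(np.ones((4, 4))))  # False: no zeros at all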
@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m): mat_flattern, shape = _reshape_2d(mat, m) mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) - pmax = np.argmax(np.matmul(mat_flattern, - patterns.reshape(patterns.shape[0], m * m).T), - axis=1) + pmax = np.argmax( + np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T), + axis=1, + ) mask_flattern[:] = patterns[pmax[:]] mask = np.empty(shape) @@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m): col_end = col_start + m mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] curr_idx += 1 - return mask[:mat.shape[0], :mat.shape[1]] + return mask[: mat.shape[0], : mat.shape[1]] def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): @@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): dtype = tensor.dtype t = tensor.astype(float) - assert isinstance(func_name, MaskAlgo), \ - "func_name argumet of create_mask is only accepted as type MaskAlgo. " \ - "But got {}".format(type(func_name)) + assert isinstance(func_name, MaskAlgo), ( + "func_name argumet of create_mask is only accepted as type MaskAlgo. " + "But got {}".format(type(func_name)) + ) func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) @@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): t = t.reshape(shape[0] * shape[1], shape[2]) # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], - shape[2]) + t = t.transpose([0, 1, 3, 2]).reshape( + shape[0] * shape[1] * shape[3], shape[2] + ) mask = func(t, n=n, m=m) - return mask.reshape([shape[0], shape[1], shape[3], - shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) + return ( + mask.reshape([shape[0], shape[1], shape[3], shape[2]]) + .transpose([0, 1, 3, 2]) + .astype(dtype) + ) else: - raise ValueError("The dimension of input tensor is not supported in create_mask, " \ - "Only dimension < 4 is supported but got {}".format(len(shape))) + raise ValueError( + "The dimension of input tensor is not supported in create_mask, " + "Only dimension < 4 is supported but got {}".format(len(shape)) + ) mask = func(t, n=n, m=m) return mask.reshape(shape).astype(dtype) @@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): shape = tensor.shape t = tensor.astype(float) - assert type(func_name) == CheckMethod, \ - "func_name argumet of check_sparsity is only accepted as type CheckMethod. " \ - "But got {}".format(type(func_name)) + assert type(func_name) == CheckMethod, ( + "func_name argumet of check_sparsity is only accepted as type CheckMethod. 
" + "But got {}".format(type(func_name)) + ) func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) @@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): t = t.reshape(shape[0] * shape[1], shape[2]) # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.transpose([0, 1, 3, - 2]).reshape([shape[0] * shape[1] * shape[3], shape[2]]) + t = t.transpose([0, 1, 3, 2]).reshape( + [shape[0] * shape[1] * shape[3], shape[2]] + ) else: - raise ValueError("The dimension of input tensor is not supported in create_mask, " \ - "Only dimension < 4 is supported but got {}".format(len(shape))) + raise ValueError( + "The dimension of input tensor is not supported in create_mask, " + "Only dimension < 4 is supported but got {}".format(len(shape)) + ) return func(t, n=n, m=m) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 879900085d57e32a2858674799a33976ab8cab66..d21db965a54d0fcfb01c75e3ff58d0acb4d0d62c 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -32,12 +32,25 @@ from . import parallel_helper from .. import unique_name from paddle.fluid import core from .layer_object_helper import LayerObjectHelper -from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder -from .base import program_desc_tracing_guard, param_guard, in_declarative_mode, _convert_into_variable +from .layer_hooks import ( + record_program_ops_pre_hook, + set_op_customized_attrs_post_hook, + LayerOpsRecoder, +) +from .base import ( + program_desc_tracing_guard, + param_guard, + in_declarative_mode, + _convert_into_variable, +) from paddle.fluid import framework from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope -from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_, in_dygraph_mode +from paddle.fluid.framework import ( + _non_static_mode, + convert_np_dtype_to_dtype_, + in_dygraph_mode, +) from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.core import VarDesc @@ -67,7 +80,7 @@ def _addindent(string, indent): class HookRemoveHelper(object): - """ A HookRemoveHelper that can be used to remove hook. """ + """A HookRemoveHelper that can be used to remove hook.""" next_hook_id = 0 @@ -153,13 +166,14 @@ class Layer(object): def train(self): """ + Sets this Layer and all its sublayers to training mode. This only effects certain modules like `Dropout` and `BatchNorm`. Returns: None - Example:: + Examples: .. code-block:: python import paddle @@ -236,6 +250,7 @@ class Layer(object): def apply(self, fn): """ + Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``) as well as self. Typical use includes initializing the parameters of a model. @@ -243,7 +258,7 @@ class Layer(object): fn (function): a function to be applied to each sublayer Returns: - Layer: self + Layer, self Example:: .. 
code-block:: python @@ -263,6 +278,7 @@ class Layer(object): net.apply(init_weights) print(net.state_dict()) + """ for layer in self.children(): layer.apply(fn) @@ -272,10 +288,12 @@ class Layer(object): return self def full_name(self): - """Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ + """ + + Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ Returns: - str: full name of this layer. + str, full name of this layer. Example:: .. code-block:: python @@ -297,7 +315,9 @@ class Layer(object): return self._full_name def register_forward_post_hook(self, hook): - """Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. + """ + + Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively. User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer. @@ -308,7 +328,7 @@ class Layer(object): hook(function): a function registered as a forward post-hook Returns: - HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . + HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . Examples: .. code-block:: python @@ -340,13 +360,16 @@ class Layer(object): # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. assert (out0.numpy() == (out1.numpy()) * 2).any() + """ hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) self._forward_post_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper def register_forward_pre_hook(self, hook): - """Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. + """ + + Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. It should have the following form, `input` of the `hook` is `input` of the `Layer`, hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if @@ -359,7 +382,7 @@ class Layer(object): hook(function): a function registered as a forward pre-hook Returns: - HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . + HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . Examples: .. code-block:: python @@ -398,12 +421,14 @@ class Layer(object): self._forward_pre_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper - def create_parameter(self, - shape, - attr=None, - dtype=None, - is_bias=False, - default_initializer=None): + def create_parameter( + self, + shape, + attr=None, + dtype=None, + is_bias=False, + default_initializer=None, + ): """Create parameters for this layer. 
Parameters: @@ -443,12 +468,15 @@ class Layer(object): temp_attr = copy.deepcopy(attr) if isinstance(temp_attr, six.string_types) and temp_attr == "": temp_attr = None - return self._helper.create_parameter(temp_attr, shape, dtype, is_bias, - default_initializer) - - @deprecated(since="2.0.0", - update_to="paddle.nn.Layer.create_tensor", - reason="New api in create_tensor, easier to use.") + return self._helper.create_parameter( + temp_attr, shape, dtype, is_bias, default_initializer + ) + + @deprecated( + since="2.0.0", + update_to="paddle.nn.Layer.create_tensor", + reason="New api in create_tensor, easier to use.", + ) def create_variable(self, name=None, persistable=None, dtype=None): """ @@ -488,14 +516,16 @@ class Layer(object): if name is not None: var_name = ".".join([self._full_name, name]) else: - var_name = unique_name.generate(".".join( - [self._full_name, "_generated_var"])) + var_name = unique_name.generate( + ".".join([self._full_name, "_generated_var"]) + ) return self._helper.main_program.current_block().create_var( name=var_name, persistable=persistable, dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR) + type=core.VarDesc.VarType.LOD_TENSOR, + ) # TODO: Add more parameter list when we need them def create_tensor(self, name=None, persistable=None, dtype=None): @@ -538,38 +568,46 @@ class Layer(object): if name is not None: var_name = ".".join([self._full_name, name]) else: - var_name = unique_name.generate(".".join( - [self._full_name, "_generated_var"])) + var_name = unique_name.generate( + ".".join([self._full_name, "_generated_var"]) + ) return self._helper.main_program.current_block().create_var( name=var_name, persistable=persistable, dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR) + type=core.VarDesc.VarType.LOD_TENSOR, + ) def parameters(self, include_sublayers=True): - """Returns a list of all Parameters from current layer and its sub-layers. + """ + + Returns a list of all Parameters from current layer and its sub-layers. Returns: - list of Tensor : a list of Parameters. + list of Tensor, a list of Parameters. Examples: .. code-block:: python - import paddle + import paddle - linear = paddle.nn.Linear(1,1) - print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 + linear = paddle.nn.Linear(1,1) + print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 """ ret = [ - param for _, param in self.named_parameters( - include_sublayers=include_sublayers) + param + for _, param in self.named_parameters( + include_sublayers=include_sublayers + ) ] return ret def children(self): - """Returns an iterator over immediate children layers. + """ + + Returns an iterator over immediate children layers. Yields: Layer: a child layer @@ -619,13 +657,15 @@ class Layer(object): yield name, layer def sublayers(self, include_self=False): - """Returns a list of sub layers. + """ + + Returns a list of sub layers. Parameters: include_self(bool, optional): Whether return self as sublayers. Default: False Returns: - list of Layer : a list of sub layers. + list of Layer, a list of sub layers. Examples: .. 
code-block:: python @@ -678,9 +718,11 @@ class Layer(object): """ params_set = set() - named_sublayers = self.named_sublayers( - prefix=prefix, include_self=True) if include_sublayers else zip( - [prefix], [self]) + named_sublayers = ( + self.named_sublayers(prefix=prefix, include_self=True) + if include_sublayers + else zip([prefix], [self]) + ) for layer_prefix, sublayer in named_sublayers: params = sublayer._parameters.items() for key, param in params: @@ -724,9 +766,9 @@ class Layer(object): if layer is None: continue layer_prefix = prefix + ('.' if prefix else '') + key - for p, l in layer.named_sublayers(prefix=layer_prefix, - include_self=True, - layers_set=layers_set): + for p, l in layer.named_sublayers( + prefix=layer_prefix, include_self=True, layers_set=layers_set + ): yield p, l def register_buffer(self, name, tensor, persistable=True): @@ -769,25 +811,32 @@ class Layer(object): if '_buffers' not in self.__dict__: raise ValueError( - "super(YourLayer, self).__init__() should be called first") + "super(YourLayer, self).__init__() should be called first" + ) elif not isinstance(name, six.string_types): raise TypeError( - "The name of buffer should be a string, but received {}.". - format(type(name).__name__)) + "The name of buffer should be a string, but received {}.".format( + type(name).__name__ + ) + ) elif '.' in name: raise KeyError( "The name of buffer can not contain `.`, " "because when you access the newly added buffer in the " - "form of `self.**.**`, it will cause AttributeError.") + "form of `self.**.**`, it will cause AttributeError." + ) elif name == '': raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not (type(tensor) == core.VarBase - or type(tensor) == core.eager.Tensor): + elif tensor is not None and not ( + type(tensor) == core.VarBase or type(tensor) == core.eager.Tensor + ): raise TypeError( - "The registered buffer should be a Paddle.Tensor, but received {}." - .format(type(tensor).__name__)) + "The registered buffer should be a Paddle.Tensor, but received {}.".format( + type(tensor).__name__ + ) + ) else: self._buffers[name] = tensor if persistable: @@ -797,13 +846,14 @@ class Layer(object): def buffers(self, include_sublayers=True): """ + Returns a list of all buffers from current layer and its sub-layers. Parameters: include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True Returns: - list of Tensor : a list of buffers. + list of Tensor, a list of buffers. Examples: .. 
code-block:: python @@ -820,8 +870,10 @@ class Layer(object): """ ret = [ - buffer for _, buffer in self.named_buffers( - include_sublayers=include_sublayers) + buffer + for _, buffer in self.named_buffers( + include_sublayers=include_sublayers + ) ] return ret @@ -862,9 +914,11 @@ class Layer(object): """ buffers_set = set() - named_sublayers = self.named_sublayers( - prefix=prefix, include_self=True) if include_sublayers else zip( - [prefix], [self]) + named_sublayers = ( + self.named_sublayers(prefix=prefix, include_self=True) + if include_sublayers + else zip([prefix], [self]) + ) for layer_prefix, sublayer in named_sublayers: buffers = sublayer._buffers.items() for key, buffer in buffers: @@ -910,7 +964,7 @@ class Layer(object): hook_result = forward_pre_hook(self, inputs) if hook_result is not None: if not isinstance(hook_result, tuple): - hook_result = (hook_result, ) + hook_result = (hook_result,) inputs = hook_result if not self._built: @@ -920,16 +974,20 @@ class Layer(object): # TODO(liuyuhui) Only xpu broadcast parameters here. # The other device is to call _sync_params_buffers in DataParallel # to realize the parameter synchronization among multiply cards. - if parallel_helper._is_data_parallel_mode( - ) and paddle.is_compiled_with_xpu(): + if ( + parallel_helper._is_data_parallel_mode() + and paddle.is_compiled_with_xpu() + ): parallel_helper._broadcast_parameters( - self._parameters.values()) + self._parameters.values() + ) self._built = True if in_profiler_mode(): - with profiler.RecordEvent(self.__class__.__name__, - profiler.TracerEventType.Forward): + with profiler.RecordEvent( + self.__class__.__name__, profiler.TracerEventType.Forward + ): outputs = self.forward(*inputs, **kwargs) else: outputs = self.forward(*inputs, **kwargs) @@ -942,8 +1000,14 @@ class Layer(object): return outputs def __call__(self, *inputs, **kwargs): - if (not in_declarative_mode()) and (not self._forward_pre_hooks) \ - and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()): + if ( + (not in_declarative_mode()) + and (not self._forward_pre_hooks) + and (not self._forward_post_hooks) + and (not self._built) + and in_dygraph_mode() + and (not in_profiler_mode()) + ): self._build_once(*inputs, **kwargs) return self.forward(*inputs, **kwargs) else: @@ -964,7 +1028,9 @@ class Layer(object): raise ValueError("Layer shouldn't implement backward") def add_sublayer(self, name, sublayer): - """Adds a sub Layer instance. + """ + + Adds a sub Layer instance. Added sublayer can be accessed by self.name @@ -972,7 +1038,7 @@ class Layer(object): name(str): name of this sublayer. sublayer(Layer): an instance of Layer. Returns: - Layer: the sublayer passed in. + Layer, the sublayer passed in. Examples: .. code-block:: python @@ -999,8 +1065,9 @@ class Layer(object): model = MySequential(fc1, fc2) for prefix, layer in model.named_sublayers(): print(prefix, layer) + """ - assert (isinstance(sublayer, Layer) or sublayer == None) + assert isinstance(sublayer, Layer) or sublayer == None self._sub_layers[name] = sublayer return sublayer @@ -1014,7 +1081,7 @@ class Layer(object): name(str): name of this sublayer. parameter(Parameter): an instance of Parameter. Returns: - Parameter: the parameter passed in. + Parameter, the parameter passed in. Examples: .. 
code-block:: python @@ -1037,32 +1104,42 @@ class Layer(object): """ if '_parameters' not in self.__dict__: raise RuntimeError( - "super(YourLayer, self).__init__() should be called firstly.") + "super(YourLayer, self).__init__() should be called firstly." + ) elif not isinstance(name, six.string_types): raise TypeError( - "The name of parameter should be a string, but received {}.". - format(type(name).__name__)) + "The name of parameter should be a string, but received {}.".format( + type(name).__name__ + ) + ) elif '.' in name: raise KeyError( "The name of parameter can not contain `.`, " "because when you access the newly added parameter in the " - "form of `self.**.**`, it will cause AttributeError.") + "form of `self.**.**`, it will cause AttributeError." + ) elif name == '': raise KeyError("The name of parameter can not be empty.") elif hasattr(self, name) and name not in self._parameters: raise KeyError("The parameter '{}' already exists.".format(name)) - elif parameter is not None and not isinstance(parameter, - framework.Parameter): + elif parameter is not None and not isinstance( + parameter, framework.Parameter + ): raise TypeError( - "The parameter to be added should be a Parameter, but received {}." - .format(type(parameter).__name__)) + "The parameter to be added should be a Parameter, but received {}.".format( + type(parameter).__name__ + ) + ) else: if parameter is None: self._parameters[name] = None if len(self._loaddict_holder) > 0: - assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( - parameter.name) + assert ( + parameter.name in self._loaddict_holder + ), "Parameter not found, Can't not find [ {} ] in state_dict".format( + parameter.name + ) parameter.set_value(self._loaddict_holder[parameter.name]) @@ -1081,37 +1158,50 @@ class Layer(object): """ def is_already_registered(is_pre_hook): - layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks - candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook + layers_hooks = ( + self._forward_pre_hooks + if is_pre_hook + else self._forward_post_hooks + ) + candidate_hook = ( + record_program_ops_pre_hook + if is_pre_hook + else set_op_customized_attrs_post_hook + ) already_registed = False if layers_hooks: last_key = next(reversed(layers_hooks)) - already_registed = (layers_hooks[last_key] == candidate_hook) + already_registed = layers_hooks[last_key] == candidate_hook return already_registed if not isinstance(attrs, dict): raise TypeError( "attrs should be type(dict), but received {}".format( - type(attrs).__name__)) + type(attrs).__name__ + ) + ) # NOTE: Overwrite behavior for same key. self._customized_attrs.update(attrs) if not is_already_registered(is_pre_hook=True): pre_hook_helper = self.register_forward_pre_hook( - record_program_ops_pre_hook) + record_program_ops_pre_hook + ) assert len(self._op_recorder.hooks) == 0 self._op_recorder.hooks = [pre_hook_helper] # manually register post_hook to ensure it is inserted into the head. 
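The hook bookkeeping in this hunk is built on the public `register_forward_pre_hook` / `register_forward_post_hook` API of `paddle.nn.Layer`. As a rough usage sketch (the layer and hook names below are invented for illustration and are not part of this change):

    .. code-block:: python

        import paddle

        class ToyLayer(paddle.nn.Layer):
            def __init__(self):
                super(ToyLayer, self).__init__()
                self.linear = paddle.nn.Linear(4, 4)

            def forward(self, x):
                return self.linear(x)

        # pre-hook: called before forward, may return replacement inputs
        def double_inputs(layer, inputs):
            return tuple(t * 2 for t in inputs)

        # post-hook: called after forward, may return a replacement output
        def add_one(layer, inputs, output):
            return output + 1

        layer = ToyLayer()
        pre_handle = layer.register_forward_pre_hook(double_inputs)
        post_handle = layer.register_forward_post_hook(add_one)

        out = layer(paddle.ones([1, 4]))

        # both returned helpers can detach their hook again
        pre_handle.remove()
        post_handle.remove()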
if not is_already_registered(is_pre_hook=False): post_hook_helper = self.register_forward_post_hook( - set_op_customized_attrs_post_hook) + set_op_customized_attrs_post_hook + ) if len(self._forward_post_hooks) > 1: - self._forward_post_hooks.move_to_end(post_hook_helper._hook_id, - last=False) + self._forward_post_hooks.move_to_end( + post_hook_helper._hook_id, last=False + ) assert len(self._op_recorder.hooks) == 1 @@ -1144,7 +1234,6 @@ class Layer(object): return object.__getattribute__(self, name) def __setattr__(self, name, value): - def _remove_if_exist(*dicts): for d in dicts: if name in d: @@ -1156,10 +1245,14 @@ class Layer(object): if isinstance(value, framework.Parameter): if params is None: raise ValueError( - "super(YourLayer, self).__init__() should be called first") + "super(YourLayer, self).__init__() should be called first" + ) if len(self._loaddict_holder) > 0: - assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( - value.name) + assert ( + value.name in self._loaddict_holder + ), "Parameter not found, Can't not find [ {} ] in state_dict".format( + value.name + ) value.set_value(self._loaddict_holder[value.name]) @@ -1168,9 +1261,10 @@ class Layer(object): elif params is not None and name in params: if value is not None: raise TypeError( - "assignment to parameter '{}' should be of type Parameter or None, but got '{}'" - .format(name, - type(value).__name__)) + "assignment to parameter '{}' should be of type Parameter or None, but got '{}'".format( + name, type(value).__name__ + ) + ) params[name] = None else: layers = self.__dict__.get('_sub_layers', None) @@ -1185,9 +1279,10 @@ class Layer(object): elif layers is not None and name in layers: if value is not None: raise TypeError( - "assignment to sublayer '{}' should be of type Layer or None, but got '{}'" - .format(name, - type(value).__name__)) + "assignment to sublayer '{}' should be of type Layer or None, but got '{}'".format( + name, type(value).__name__ + ) + ) layers[name] = None else: _buffers = self.__dict__.get('_buffers', None) @@ -1196,8 +1291,9 @@ class Layer(object): raise ValueError( "super(YourLayer, self).__init__() should be called first" ) - _remove_if_exist(self.__dict__, self._parameters, - self._sub_layers) + _remove_if_exist( + self.__dict__, self._parameters, self._sub_layers + ) # Set persistable=False by default. Only `register_buffer` can # add a persistable buffer. if name not in self._buffers: @@ -1211,6 +1307,7 @@ class Layer(object): # value via `assign`. if type(value) == framework.Variable: from paddle import assign + # Note(zhhsplendid): the condition below happens in PaddleGan model, # but should all non-Variable _buffers[name] be re-assign? We # should consider it in the future. I current wrote this as @@ -1218,18 +1315,23 @@ class Layer(object): if in_declarative_mode() and _buffers[name] is None: raise RuntimeError( 'In Dy2stat, self.{0} is a buffer and self.{0} is ' - 'not allowed to be set to Variable when self.{0} is None.' 
- .format(name)) - elif _buffers[name] is None or type(getattr( - self, name)) == core.VarBase: + 'not allowed to be set to Variable when self.{0} is None.'.format( + name + ) + ) + elif ( + _buffers[name] is None + or type(getattr(self, name)) == core.VarBase + ): _buffers[name] = assign(value) else: assign(value, getattr(self, name)) elif value is not None: raise TypeError( - "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'" - .format(name, - type(value).__name__)) + "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'".format( + name, type(value).__name__ + ) + ) else: # Assigning None will remove the buffer, but if re-assign a new varBase to it, # it will be remarked as a buffer with same `persistable` attribute. @@ -1316,12 +1418,14 @@ class Layer(object): self._state_dict_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper - def _obtain_parameters_buffers(self, - destination=None, - include_sublayers=True, - structured_name_prefix=""): + def _obtain_parameters_buffers( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + ): """ - The difference from state_dict() is that state_dict_hook will not be called, + The difference from state_dict() is that state_dict_hook will not be called, but the original types of parameters and buffers will be maintained. """ if destination is None: @@ -1330,7 +1434,10 @@ class Layer(object): if data is not None: destination[structured_name_prefix + name] = data for name, buffer in self._buffers.items(): - if buffer is not None and name not in self._non_persistable_buffer_names_set: + if ( + buffer is not None + and name not in self._non_persistable_buffer_names_set + ): destination[structured_name_prefix + name] = buffer if include_sublayers: @@ -1339,17 +1446,22 @@ class Layer(object): destination_temp = destination.copy() destination_temp.update( layer_item._obtain_parameters_buffers( - destination_temp, include_sublayers, - structured_name_prefix + layer_name + ".")) + destination_temp, + include_sublayers, + structured_name_prefix + layer_name + ".", + ) + ) destination = destination_temp return destination - def _state_dict_impl(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - include_non_persistable_buffer=False, - use_hook=True): + def _state_dict_impl( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + include_non_persistable_buffer=False, + use_hook=True, + ): """ Get all parameters and persistable buffers of current layer and its sub-layers. 
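The persistable flag checked by this state-dict path is what decides whether a buffer is exported. A minimal sketch of the difference, assuming a toy layer defined only for illustration:

    .. code-block:: python

        import paddle

        class StatsLayer(paddle.nn.Layer):
            def __init__(self):
                super(StatsLayer, self).__init__()
                # persistable buffer: kept by state_dict(), never trained
                self.register_buffer("running_mean", paddle.zeros([10]), persistable=True)
                # non-persistable buffer: lives on the layer but is skipped by state_dict()
                self.register_buffer("scratch", paddle.zeros([10]), persistable=False)

            def forward(self, x):
                return x + self.running_mean

        layer = StatsLayer()
        print(len(layer.buffers()))               # 2, both buffers are tracked
        print(sorted(layer.state_dict().keys()))  # ['running_mean'] only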
And set them into a dict @@ -1367,7 +1479,10 @@ class Layer(object): destination[structured_name_prefix + name] = data for name, buffer in self._buffers.items(): if not include_non_persistable_buffer: - if buffer is not None and name not in self._non_persistable_buffer_names_set: + if ( + buffer is not None + and name not in self._non_persistable_buffer_names_set + ): destination[structured_name_prefix + name] = buffer else: if buffer is not None: @@ -1379,9 +1494,13 @@ class Layer(object): destination_temp = destination.copy() destination_temp.update( layer_item._state_dict_impl( - destination_temp, include_sublayers, + destination_temp, + include_sublayers, structured_name_prefix + layer_name + ".", - include_non_persistable_buffer, use_hook)) + include_non_persistable_buffer, + use_hook, + ) + ) destination = destination_temp if use_hook: for state_dict_hook in self._state_dict_hooks.values(): @@ -1391,12 +1510,15 @@ class Layer(object): return destination - def to_static_state_dict(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - use_hook=True): + def to_static_state_dict( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + use_hook=True, + ): ''' + Get all parameters and buffers of current layer and its sub-layers. And set them into a dict Parameters: @@ -1405,7 +1527,7 @@ class Layer(object): use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True Retruns: - dict: a dict contains all the parameters and persistable buffers. + dict, a dict contains all the parameters and persistable buffers. Examples: .. code-block:: python @@ -1423,13 +1545,16 @@ class Layer(object): include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix, include_non_persistable_buffer=True, - use_hook=use_hook) - - def state_dict(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - use_hook=True): + use_hook=use_hook, + ) + + def state_dict( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + use_hook=True, + ): ''' Get all parameters and persistable buffers of current layer and its sub-layers. 
And set them into a dict @@ -1457,7 +1582,8 @@ class Layer(object): include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix, include_non_persistable_buffer=False, - use_hook=use_hook) + use_hook=use_hook, + ) @framework.deprecate_stat_dict def set_state_dict(self, state_dict, use_structured_name=True): @@ -1489,22 +1615,31 @@ class Layer(object): state = state_dict.get(key, None) if state is None: raise ValueError( - "{} is not found in the provided dict.".format(key)) - if (isinstance(state, dict) or isinstance(state, list)): - if (len(state) != len(param)): - raise ValueError("{} receieves the length of {}, " - "but the expected shape is {}".format( - key, len(state), len(param))) + "{} is not found in the provided dict.".format(key) + ) + if isinstance(state, dict) or isinstance(state, list): + if len(state) != len(param): + raise ValueError( + "{} receieves the length of {}, " + "but the expected shape is {}".format( + key, len(state), len(param) + ) + ) else: return param, state else: - state_shape = state.shape() if inspect.ismethod( - state.shape) else state.shape + state_shape = ( + state.shape() + if inspect.ismethod(state.shape) + else state.shape + ) if list(state_shape) != list(param.shape): raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state_shape), list(param.shape))) + "{} receives a shape {}, but the expected shape is {}.".format( + key, list(state_shape), list(param.shape) + ) + ) return param, state matched_param_state = [] @@ -1541,8 +1676,10 @@ class Layer(object): executor = Executor(_get_device())._default_executor # restore parameter states core._create_loaded_parameter( - [param for param, state in matched_param_state], global_scope(), - executor) + [param for param, state in matched_param_state], + global_scope(), + executor, + ) for param, state in matched_param_state: _set_var(param, state) @@ -1559,7 +1696,7 @@ class Layer(object): blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + Returns: self @@ -1594,11 +1731,13 @@ class Layer(object): # [ 0.33960250, 0.96878713]]) ''' - return self._to_impl(device=device, - dtype=dtype, - blocking=blocking, - include_sublayers=True, - floating_only=False) + return self._to_impl( + device=device, + dtype=dtype, + blocking=blocking, + include_sublayers=True, + floating_only=False, + ) def _apply(self, func, device, dtype, blocking, include_sublayers=True): if include_sublayers: @@ -1612,8 +1751,9 @@ class Layer(object): if param.grad is not None: with no_grad(): - grad_applied = func(param._grad_ivar(), device, dtype, - blocking) + grad_applied = func( + param._grad_ivar(), device, dtype, blocking + ) for key, buf in self._buffers.items(): if buf is not None: @@ -1637,12 +1777,14 @@ class Layer(object): # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. 
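The shape and length checks in ``set_state_dict`` above assume the loaded dict was produced by a layer with the same structure. A rough save/restore sketch (the file name is chosen arbitrarily):

    .. code-block:: python

        import paddle

        layer = paddle.nn.Linear(4, 2)

        # collect all parameters and persistable buffers
        state = layer.state_dict()
        paddle.save(state, "linear.pdparams")

        # rebuild an identical layer and restore the saved values into it
        restored = paddle.nn.Linear(4, 2)
        restored.set_state_dict(paddle.load("linear.pdparams"))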
waiting_alloc_memory = ( - (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ) gpu_memory_available = core.gpu_memory_available() if gpu_memory_available < waiting_alloc_memory: # Copy param / Tensor to cpu - t_used = t._copy_to(paddle.CPUPlace(), - blocking) # k-v type will error + t_used = t._copy_to( + paddle.CPUPlace(), blocking + ) # k-v type will error # Release mem of t t.value().get_tensor()._clear() else: @@ -1653,7 +1795,8 @@ class Layer(object): # 2. cast param / Tensor to dtype if dtype is not None and dtype != t_used.dtype: with paddle.fluid.framework._dygraph_place_guard( - place=t_used.place): + place=t_used.place + ): t_casted = t_used.cast(dtype=dtype) else: t_casted = t_used @@ -1671,12 +1814,14 @@ class Layer(object): return t - def _to_impl(self, - device=None, - dtype=None, - blocking=None, - include_sublayers=True, - floating_only=False): + def _to_impl( + self, + device=None, + dtype=None, + blocking=None, + include_sublayers=True, + floating_only=False, + ): ''' Cast the parameters and buffers of Layer by the give device, dtype and blocking. @@ -1689,7 +1834,7 @@ class Layer(object): blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True. floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking. @@ -1705,20 +1850,28 @@ class Layer(object): if device is not None: if isinstance(device, str): device = paddle.device._convert_to_place(device) - elif isinstance(device, (core.CPUPlace, core.CUDAPlace, - core.CUDAPinnedPlace, core.XPUPlace)): + elif isinstance( + device, + ( + core.CPUPlace, + core.CUDAPlace, + core.CUDAPinnedPlace, + core.XPUPlace, + ), + ): pass else: raise ValueError( "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " - + type(device).__name__) + + type(device).__name__ + ) if blocking is None: blocking = True else: assert isinstance( - blocking, - bool), "blocking value error, must be the True, False or None" + blocking, bool + ), "blocking value error, must be the True, False or None" def transform(t, device, dtype, blocking): if floating_only and (not paddle.is_floating_point(t)): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2936c532b01422f227409e9f30567cdfb0f9ab96..4fbcdc78536acfba7685182b9be6b56d59583dc0 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1352,12 +1352,13 @@ class ParameterMetaClass(VariableMetaClass): @six.add_metaclass(VariableMetaClass) class Variable(object): """ - **Notes**: - **The constructor of Variable should not be invoked directly.** - **In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.** + Notes: + The constructor of Variable should not be invoked directly. + + In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being feed. 
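Before the hunks move on to ``Variable`` in ``framework.py``, here is a hedged sketch of how the ``to()`` entry point that drives ``_to_impl`` and ``_apply`` above is typically called (the values shown in comments are indicative only):

    .. code-block:: python

        import paddle

        linear = paddle.nn.Linear(2, 2)

        # move all parameters and buffers to the CPU and cast them to float64 in place;
        # device accepts a place object or a string such as 'cpu', 'gpu:0' or 'xpu:0'
        linear.to(device='cpu', dtype='float64', blocking=True)

        print(linear.weight.dtype)   # paddle.float64
        print(linear.weight.place)   # Place(cpu)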
- **In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data** + In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data. In Fluid, every input and output of an OP is a variable. In most cases, variables are used for holding different kinds of data or training @@ -1514,12 +1515,13 @@ class Variable(object): def detach(self): """ + Returns a new Variable, detached from the current graph. It will share data with origin Variable and without tensor copy. In addition, the detached Variable doesn't provide gradient propagation. Returns: - ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable. + ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable. Examples: .. code-block:: python @@ -1533,6 +1535,7 @@ class Variable(object): # create a detached Variable y = x.detach() + """ assert ( @@ -2085,6 +2088,7 @@ class Variable(object): @property def T(self): """ + Permute current Variable with its dimensions reversed. If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`. @@ -2103,6 +2107,7 @@ class Variable(object): x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0] print(x_T_np.shape) # (5, 3, 2) + """ if len(self.shape) == 1: return self @@ -2141,7 +2146,7 @@ class Variable(object): as ``out = assign(tensor)`` . Returns: - Variable: The cloned Variable. + Variable, The cloned Variable. Examples: .. code-block:: python @@ -2171,6 +2176,7 @@ class Variable(object): def _set_error_clip(self, error_clip): """ + Set the error_clip. Args: @@ -2178,11 +2184,13 @@ class Variable(object): Returns: None + """ self.error_clip = error_clip def _set_info(self, key, value): """ + Set key-value information for this variable. Args: @@ -2191,6 +2199,7 @@ class Variable(object): Returns: None + """ if not hasattr(self, "_info"): self._info = {} @@ -2198,6 +2207,7 @@ class Variable(object): def _get_info(self, key): """ + Get the information of this variable corresponding to key. Args: @@ -2205,6 +2215,7 @@ class Variable(object): Returns: object + """ if hasattr(self, "_info") and key in self._info: return self._info[key] @@ -2212,7 +2223,9 @@ class Variable(object): def _slice_indices(self, slice, length): """ + Reference implementation for the slice.indices method. + """ # Compute step and length as integers. step = 1 if slice.step is None else slice.step @@ -2383,7 +2396,7 @@ class Variable(object): Default: None Returns: - Tensor: the value in given scope. + Tensor, the value in given scope. Examples: .. code-block:: python @@ -2438,6 +2451,7 @@ class Variable(object): def set_value(self, value, scope=None): ''' + Set the value to the tensor in given scope. Args: @@ -2477,6 +2491,7 @@ class Variable(object): if var.persistable: t_load = paddle.load(path+var.name+'.pdtensor') var.set_value(t_load) + ''' # The 'framework' is a low-level module, and 'executor' @@ -2547,10 +2562,11 @@ class Variable(object): def size(self): """ + Returns the number of elements for current Variable, which is a int64 Variable with shape [1] Returns: - Variable: the number of elements for current Variable + Variable, the number of elements for current Variable Examples: .. 
code-block:: python @@ -2564,6 +2580,7 @@ class Variable(object): # get the number of elements of the Variable y = x.size() + """ output = self.block.create_var( @@ -2578,23 +2595,27 @@ class Variable(object): def _set_attr(self, name, val): """ + Set the value of attribute by attribute's name. Args: name(str): the attribute name. val(int|str|list): the value of the attribute. + """ self._update_desc_attr(name, val) def _has_attr(self, name): """ + Whether this Variable has the attribute with the name `name` or not. Args: name(str): the attribute name. Returns: - bool: True if has this attribute. + bool, True if has this attribute. + """ return self.desc.has_attr(name) @@ -2624,7 +2645,7 @@ class Variable(object): name(str): the attribute name. Returns: - int|str|list: The attribute value. The return value + int|str|list, The attribute value. The return value can be any valid attribute type. """ return self.desc.attr(name) @@ -3196,14 +3217,16 @@ class Operator(object): def input(self, name): r""" + Get the input arguments according to the input parameter name. Args: name(str): The input parameter name. Returns: - list: return the list of argument names that associated with \ + list, return the list of argument names that associated with \ the specific parameter name. + """ return self.desc.input(name) diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 736213340e90274aaf03c55ca98bc780c915a264..1df224ed05048261e17f4ca6885a7a36d62fe02c 100755 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -20,7 +20,13 @@ from __future__ import print_function import warnings from ..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable, _non_static_mode, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode +from ..framework import ( + Variable, + _non_static_mode, + _varbase_creator, + _in_legacy_dygraph, + in_dygraph_mode, +) from .. import core from ..param_attr import ParamAttr from . import nn @@ -33,22 +39,29 @@ __all__ = ['accuracy', 'auc'] def accuracy(input, label, k=1, correct=None, total=None): """ + accuracy layer. Refer to the https://en.wikipedia.org/wiki/Precision_and_recall This function computes the accuracy using the input and label. If the correct label occurs in top k predictions, then correct will increment by one. - Note: the dtype of accuracy is determined by input. the input and label dtype can be different. + + Note: + the dtype of accuracy is determined by input. the input and label dtype can be different. + Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . - k(int): The top k predictions for each class will be checked. Data type is int64 or int32. - correct(Tensor): The correct predictions count. A Tensor with type int64 or int32. - total(Tensor): The total entries count. A tensor with type int64 or int32. + k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1. + correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None. + total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None. + Returns: - Tensor: The correct rate. A Tensor with type float32. 
+ Tensor, The correct rate. A Tensor with type float32. + Examples: .. code-block:: python + import numpy as np import paddle import paddle.static as static @@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None): fetch_list=[result[0]]) print(output) #[array([0.], dtype=float32)] + """ if _non_static_mode(): if correct is None: @@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None): total = _varbase_creator(dtype="int32") _k = k.numpy().item(0) if isinstance(k, Variable) else k - topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k, - 'sorted', False) - _acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label, - correct, total) + topk_out, topk_indices = _legacy_C_ops.top_k_v2( + input, 'k', _k, 'sorted', False + ) + _acc, _, _ = _legacy_C_ops.accuracy( + topk_out, topk_indices, label, correct, total + ) return _acc helper = LayerHelper("accuracy", **locals()) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'accuracy') + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'accuracy' + ) topk_out = helper.create_variable_for_type_inference(dtype=input.dtype) topk_indices = helper.create_variable_for_type_inference(dtype="int64") inputs = {"X": [input]} @@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None): else: attrs = {'k': k} attrs['sorted'] = False - helper.append_op(type="top_k_v2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": [topk_out], - "Indices": [topk_indices] - }) + helper.append_op( + type="top_k_v2", + inputs=inputs, + attrs=attrs, + outputs={"Out": [topk_out], "Indices": [topk_indices]}, + ) acc_out = helper.create_variable_for_type_inference(dtype="float32") if correct is None: correct = helper.create_variable_for_type_inference(dtype="int32") if total is None: total = helper.create_variable_for_type_inference(dtype="int32") - helper.append_op(type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - outputs={ - "Accuracy": [acc_out], - "Correct": [correct], - "Total": [total], - }) + helper.append_op( + type="accuracy", + inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]}, + outputs={ + "Accuracy": [acc_out], + "Correct": [correct], + "Total": [total], + }, + ) return acc_out -def auc(input, - label, - curve='ROC', - num_thresholds=2**12 - 1, - topk=1, - slide_steps=1, - ins_tag_weight=None): +def auc( + input, + label, + curve='ROC', + num_thresholds=2**12 - 1, + topk=1, + slide_steps=1, + ins_tag_weight=None, +): """ **Area Under the Curve (AUC) Layer** @@ -216,13 +232,14 @@ def auc(input, helper = LayerHelper("auc", **locals()) if ins_tag_weight is None: - ins_tag_weight = tensor.fill_constant(shape=[1, 1], - dtype="float32", - value=1.0) + ins_tag_weight = tensor.fill_constant( + shape=[1, 1], dtype="float32", value=1.0 + ) check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc') check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc') - check_variable_and_dtype(ins_tag_weight, 'ins_tag_weight', - ['float32', 'float64'], 'auc') + check_variable_and_dtype( + ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc' + ) auc_out = helper.create_variable_for_type_inference(dtype="float64") batch_auc_out = helper.create_variable_for_type_inference(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. 
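The ``auc`` layer reworked in this file returns the global AUC, the sliding-window batch AUC and the four accumulated stat tensors. A hedged static-graph sketch of calling it (tensor names and sizes below are arbitrary):

    .. code-block:: python

        import numpy as np
        import paddle
        import paddle.fluid as fluid

        paddle.enable_static()

        # per-class probabilities and integer labels for a binary task
        predict = fluid.data(name="predict", shape=[None, 2], dtype="float32")
        label = fluid.data(name="label", shape=[None, 1], dtype="int64")

        auc_out, batch_auc_out, stat_vars = fluid.layers.auc(
            input=predict, label=label, num_thresholds=2**12 - 1, slide_steps=1
        )

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        feed = {
            "predict": np.random.uniform(size=(8, 2)).astype("float32"),
            "label": np.random.randint(0, 2, size=(8, 1)).astype("int64"),
        }
        print(exe.run(feed=feed, fetch_list=[auc_out])[0])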
@@ -236,62 +253,71 @@ def auc(input, batch_stat_pos = helper.create_global_variable( persistable=True, dtype='int64', - shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) + shape=[(1 + slide_steps) * (num_thresholds + 1) + 1], + ) batch_stat_neg = helper.create_global_variable( persistable=True, dtype='int64', - shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) + shape=[(1 + slide_steps) * (num_thresholds + 1) + 1], + ) # for global auc # Needn't maintain the batch id - stat_pos = helper.create_global_variable(persistable=True, - dtype='int64', - shape=[1, num_thresholds + 1]) - stat_neg = helper.create_global_variable(persistable=True, - dtype='int64', - shape=[1, num_thresholds + 1]) + stat_pos = helper.create_global_variable( + persistable=True, dtype='int64', shape=[1, num_thresholds + 1] + ) + stat_neg = helper.create_global_variable( + persistable=True, dtype='int64', shape=[1, num_thresholds + 1] + ) for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: - helper.set_variable_initializer(var, Constant(value=0.0, - force_cpu=False)) + helper.set_variable_initializer( + var, Constant(value=0.0, force_cpu=False) + ) - #"InsTagWeight": [ins_tag_weight] + # "InsTagWeight": [ins_tag_weight] # Batch AUC - helper.append_op(type="auc", - inputs={ - "Predict": [input], - "Label": [label], - "StatPos": [batch_stat_pos], - "StatNeg": [batch_stat_neg] - }, - attrs={ - "curve": curve, - "num_thresholds": num_thresholds, - "slide_steps": slide_steps - }, - outputs={ - "AUC": [batch_auc_out], - "StatPosOut": [batch_stat_pos], - "StatNegOut": [batch_stat_neg] - }) + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [batch_stat_pos], + "StatNeg": [batch_stat_neg], + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": slide_steps, + }, + outputs={ + "AUC": [batch_auc_out], + "StatPosOut": [batch_stat_pos], + "StatNegOut": [batch_stat_neg], + }, + ) # Global AUC - helper.append_op(type="auc", - inputs={ - "Predict": [input], - "Label": [label], - "StatPos": [stat_pos], - "StatNeg": [stat_neg] - }, - attrs={ - "curve": curve, - "num_thresholds": num_thresholds, - "slide_steps": 0 - }, - outputs={ - "AUC": [auc_out], - "StatPosOut": [stat_pos], - "StatNegOut": [stat_neg] - }) - return auc_out, batch_auc_out, [ - batch_stat_pos, batch_stat_neg, stat_pos, stat_neg - ] + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [stat_pos], + "StatNeg": [stat_neg], + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": 0, + }, + outputs={ + "AUC": [auc_out], + "StatPosOut": [stat_pos], + "StatNegOut": [stat_neg], + }, + ) + return ( + auc_out, + batch_auc_out, + [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg], + ) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b39284242ec088db9bd973baaa92d06690545ee5..49180f8c9670fb648257edaace136fead6c66a8c 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -27,18 +27,39 @@ import paddle from ..layer_helper import LayerHelper from paddle.fluid.framework import _in_legacy_dygraph from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, _non_static_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags, _in_legacy_dygraph, in_dygraph_mode +from ..framework import ( + Variable, + OpProtoHolder, + _non_static_mode, + 
dygraph_only, + _dygraph_tracer, + default_main_program, + _varbase_creator, + static_only, + _global_flags, + _in_legacy_dygraph, + in_dygraph_mode, +) from ..framework import _current_expected_place from .. import dygraph_utils from ..param_attr import ParamAttr -from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ +from .layer_function_generator import ( + autodoc, + templatedoc, + _generate_doc_string_, +) from .tensor import concat, assign, fill_constant, zeros, tensor_array_to_tensor from . import utils from .. import unique_name from functools import reduce from .. import core from ...utils import deprecated -from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..data_feeder import ( + convert_dtype, + check_variable_and_dtype, + check_type, + check_dtype, +) import paddle from paddle.utils import deprecated from paddle import _C_ops, _legacy_C_ops @@ -210,13 +231,9 @@ OP_NAMEMAPPING = { @dygraph_only -def _elementwise_op_in_dygraph(x, - y, - axis=-1, - act=None, - use_mkldnn=False, - op_name=None): - +def _elementwise_op_in_dygraph( + x, y, axis=-1, act=None, use_mkldnn=False, op_name=None +): def is_inplace(op_name): return op_name[-1] == "_" @@ -227,24 +244,27 @@ def _elementwise_op_in_dygraph(x, if in_dygraph_mode(): op = getattr( _C_ops, - OP_NAMEMAPPING[op_name] if not is_inplace(op_name) else op_name) + OP_NAMEMAPPING[op_name] if not is_inplace(op_name) else op_name, + ) out = op(x, y) if _in_legacy_dygraph(): op = getattr(_legacy_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) - return dygraph_utils._append_activation_in_dygraph(out, - act, - use_mkldnn=use_mkldnn) - - -def fc(input, - size, - num_flatten_dims=1, - param_attr=None, - bias_attr=None, - act=None, - name=None): + return dygraph_utils._append_activation_in_dygraph( + out, act, use_mkldnn=use_mkldnn + ) + + +def fc( + input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None, +): r""" :api_attr: Static Graph @@ -362,8 +382,9 @@ def fc(input, for i, input_x in enumerate(input): check_type(input_x, 'input[' + str(i) + ']', Variable, 'fc') dtype = helper.input_dtype() - check_dtype(dtype, 'input', ['float16', 'uint16', 'float32', 'float64'], - 'fc') + check_dtype( + dtype, 'input', ['float16', 'uint16', 'float32', 'float64'], 'fc' + ) mul_results = [] for input_var, param_attr in helper.iter_inputs_and_params(): input_shape = input_var.shape @@ -373,31 +394,28 @@ def fc(input, reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] - w = helper.create_parameter(attr=param_attr, - shape=param_shape, - dtype=dtype, - is_bias=False) + w = helper.create_parameter( + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False + ) tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="mul", - inputs={ - "X": input_var, - "Y": w - }, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": num_flatten_dims, - "y_num_col_dims": 1 - }) + helper.append_op( + type="mul", + inputs={"X": input_var, "Y": w}, + outputs={"Out": tmp}, + attrs={"x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1}, + ) mul_results.append(tmp) if len(mul_results) == 1: pre_bias = mul_results[0] else: pre_bias = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="sum", - inputs={"X": mul_results}, - outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}) + helper.append_op( + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}, + 
attrs={"use_mkldnn": False}, + ) # add bias pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation @@ -405,13 +423,15 @@ def fc(input, @deprecated(since="2.0.0", update_to="paddle.nn.functional.embedding") -def embedding(input, - size, - is_sparse=False, - is_distributed=False, - padding_idx=None, - param_attr=None, - dtype='float32'): +def embedding( + input, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32', +): r""" :api_attr: Static Graph @@ -507,7 +527,7 @@ def embedding(input, import numpy as np import paddle paddle.enable_static() - + data = fluid.data(name='x', shape=[None, 1], dtype='int64') # example 1 @@ -524,10 +544,15 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - check_variable_and_dtype(input, 'input', ['int64'], - 'fluid.layers.embedding') - check_dtype(dtype, 'dtype', ['uint16', 'float16', 'float32', 'float64'], - 'fluid.layers.embedding') + check_variable_and_dtype( + input, 'input', ['int64'], 'fluid.layers.embedding' + ) + check_dtype( + dtype, + 'dtype', + ['uint16', 'float16', 'float32', 'float64'], + 'fluid.layers.embedding', + ) if is_distributed: is_distributed = False @@ -537,37 +562,42 @@ def embedding(input, remote_prefetch = True if is_sparse else False - w = helper.create_parameter(attr=helper.param_attr, - shape=size, - dtype=dtype, - is_bias=False) + w = helper.create_parameter( + attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False + ) tmp = helper.create_variable_for_type_inference(dtype) - padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( - size[0] + padding_idx) - helper.append_op(type='lookup_table', - inputs={ - 'Ids': input, - 'W': w - }, - outputs={'Out': tmp}, - attrs={ - 'is_sparse': is_sparse, - 'is_distributed': is_distributed, - 'remote_prefetch': remote_prefetch, - 'padding_idx': padding_idx - }) + padding_idx = ( + -1 + if padding_idx is None + else padding_idx + if padding_idx >= 0 + else (size[0] + padding_idx) + ) + helper.append_op( + type='lookup_table', + inputs={'Ids': input, 'W': w}, + outputs={'Out': tmp}, + attrs={ + 'is_sparse': is_sparse, + 'is_distributed': is_distributed, + 'remote_prefetch': remote_prefetch, + 'padding_idx': padding_idx, + }, + ) return tmp -def _pull_sparse(input, - size, - table_id, - accessor_class, - name="embedding", - ctr_label_name="", - padding_id=0, - dtype='float32', - scale_sparse_grad=True): +def _pull_sparse( + input, + size, + table_id, + accessor_class, + name="embedding", + ctr_label_name="", + padding_id=0, + dtype='float32', + scale_sparse_grad=True, +): r""" **Pull Fleet Sparse Layer** @@ -614,35 +644,34 @@ def _pull_sparse(input, 'ScaleSparseGrad': scale_sparse_grad, 'InputNames': input_names, # this is only for compatible with embedding op - 'is_distributed': True + 'is_distributed': True, } # this is only for compatible with embedding op - w, _ = helper.create_or_get_global_variable(name=name, - shape=[size], - dtype=dtype, - is_bias=False, - persistable=True) - helper.append_op(type='pull_sparse', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs=attrs) + w, _ = helper.create_or_get_global_variable( + name=name, shape=[size], dtype=dtype, is_bias=False, persistable=True + ) + helper.append_op( + type='pull_sparse', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs=attrs, + ) if len(outs) == 1: return outs[0] return outs -def _pull_sparse_v2(input, - size, - table_id, - 
accessor_class, - name="embedding", - ctr_label_name="", - padding_id=0, - dtype='float32', - scale_sparse_grad=True): +def _pull_sparse_v2( + input, + size, + table_id, + accessor_class, + name="embedding", + ctr_label_name="", + padding_id=0, + dtype='float32', + scale_sparse_grad=True, +): r""" **Pull Fleet Sparse Layer** @@ -689,31 +718,26 @@ def _pull_sparse_v2(input, 'ScaleSparseGrad': scale_sparse_grad, 'InputNames': input_names, # this is only for compatible with embedding op - 'is_distributed': True + 'is_distributed': True, } # this is only for compatible with embedding op - w, _ = helper.create_or_get_global_variable(name=name, - shape=[size], - dtype=dtype, - is_bias=False, - persistable=True) - helper.append_op(type='pull_sparse_v2', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs=attrs) + w, _ = helper.create_or_get_global_variable( + name=name, shape=[size], dtype=dtype, is_bias=False, persistable=True + ) + helper.append_op( + type='pull_sparse_v2', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs=attrs, + ) if len(outs) == 1: return outs[0] return outs -def _pull_gpups_sparse(input, - size, - dtype='float32', - is_distributed=False, - is_sparse=False): +def _pull_gpups_sparse( + input, size, dtype='float32', is_distributed=False, is_sparse=False +): r""" **Pull GpuPS Sparse Layer** @@ -747,39 +771,36 @@ def _pull_gpups_sparse(input, helper = LayerHelper('pull_gpups_sparse', **locals()) if dtype != 'float32': raise ValueError( - "GpuPS only support float type embedding now, and your type is: " + - dtype) + "GpuPS only support float type embedding now, and your type is: " + + dtype + ) helper.input_dtype() inputs = helper.multiple_input() outs = [ helper.create_variable_for_type_inference(dtype) for i in range(len(inputs)) ] - w = helper.create_parameter(attr=helper.param_attr, - shape=[size[0]], - dtype=dtype, - is_bias=False) - helper.append_op(type='pull_gpups_sparse', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs={ - 'size': size, - 'is_distributed': is_distributed, - 'is_sparse': is_sparse - }) + w = helper.create_parameter( + attr=helper.param_attr, shape=[size[0]], dtype=dtype, is_bias=False + ) + helper.append_op( + type='pull_gpups_sparse', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs={ + 'size': size, + 'is_distributed': is_distributed, + 'is_sparse': is_sparse, + }, + ) if len(outs) == 1: return outs[0] return outs -def _pull_box_sparse(input, - size, - dtype='float32', - is_distributed=False, - is_sparse=False): +def _pull_box_sparse( + input, size, dtype='float32', is_distributed=False, is_sparse=False +): r""" **Pull Box Sparse Layer** @@ -809,29 +830,28 @@ def _pull_box_sparse(input, helper = LayerHelper('pull_box_sparse', **locals()) if dtype != 'float32': raise ValueError( - "BoxPS only support float type embedding now, and your type is: " + - dtype) + "BoxPS only support float type embedding now, and your type is: " + + dtype + ) helper.input_dtype() inputs = helper.multiple_input() outs = [ helper.create_variable_for_type_inference(dtype) for i in range(len(inputs)) ] - w = helper.create_parameter(attr=helper.param_attr, - shape=[size], - dtype=dtype, - is_bias=False) - helper.append_op(type='pull_box_sparse', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs={ - 'size': size, - 'is_distributed': is_distributed, - 'is_sparse': is_sparse - }) + w = helper.create_parameter( + attr=helper.param_attr, shape=[size], dtype=dtype, is_bias=False 
+ ) + helper.append_op( + type='pull_box_sparse', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs={ + 'size': size, + 'is_distributed': is_distributed, + 'is_sparse': is_sparse, + }, + ) if len(outs) == 1: return outs[0] return outs @@ -927,37 +947,46 @@ def linear_chain_crf(input, label, param_attr=None, length=None): print(transition) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'linear_chain_crf') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'linear_chain_crf' + ) check_variable_and_dtype(label, 'label', ['int64'], 'linear_chain_crf') helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[2] if length else input.shape[1] - transition = helper.create_parameter(attr=helper.param_attr, - shape=[size + 2, size], - dtype=helper.input_dtype()) + transition = helper.create_parameter( + attr=helper.param_attr, + shape=[size + 2, size], + dtype=helper.input_dtype(), + ) alpha = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) emission_exps = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) transition_exps = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) log_likelihood = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) this_inputs = { "Emission": [input], "Transition": transition, - "Label": [label] + "Label": [label], } if length: this_inputs['Length'] = [length] - helper.append_op(type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }) + helper.append_op( + type='linear_chain_crf', + inputs=this_inputs, + outputs={ + "Alpha": [alpha], + "EmissionExps": [emission_exps], + "TransitionExps": transition_exps, + "LogLikelihood": log_likelihood, + }, + ) return log_likelihood @@ -1013,18 +1042,22 @@ def crf_decoding(input, param_attr, label=None, length=None): crf_decode = paddle.static.nn.crf_decoding(input=emission, length=length, param_attr=paddle.ParamAttr(name="crfw_pad")) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'crf_decoding') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'crf_decoding' + ) helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) viterbi_path = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.INT64) + dtype=core.VarDesc.VarType.INT64 + ) inputs = {"Emission": [input], "Transition": transition, "Label": label} if length: inputs['Length'] = length - helper.append_op(type='crf_decoding', - inputs=inputs, - outputs={"ViterbiPath": [viterbi_path]}) + helper.append_op( + type='crf_decoding', + inputs=inputs, + outputs={"ViterbiPath": [viterbi_path]}, + ) return viterbi_path @@ -1058,26 +1091,23 @@ def cos_sim(X, Y): out = helper.create_variable_for_type_inference(dtype=X.dtype) xnorm = helper.create_variable_for_type_inference(dtype=X.dtype) ynorm = helper.create_variable_for_type_inference(dtype=X.dtype) - helper.append_op(type='cos_sim', - inputs={ - 'X': [X], - 'Y': [Y] - }, - outputs={ - 'Out': [out], - 'XNorm': [xnorm], - 'YNorm': [ynorm] - }) + helper.append_op( + type='cos_sim', + inputs={'X': [X], 'Y': [Y]}, + outputs={'Out': [out], 'XNorm': [xnorm], 'YNorm': [ynorm]}, + ) return out @deprecated(since="2.0.0", 
update_to="paddle.nn.functional.dropout") -def dropout(x, - dropout_prob, - is_test=None, - seed=None, - name=None, - dropout_implementation="downgrade_in_infer"): +def dropout( + x, + dropout_prob, + is_test=None, + seed=None, + name=None, + dropout_implementation="downgrade_in_infer", +): """ Computes dropout. @@ -1093,7 +1123,7 @@ def dropout(x, Args: x (Variable): The input tensor variable. The data type is float16 or float32 or float64. dropout_prob (float): Probability of setting units to zero. - is_test (bool): A flag indicating whether it is in test phrase or not. + is_test (bool): A flag indicating whether it is in test phrase or not. Default None, in dynamic graph, it use global tracer mode; in static graph, it means False. seed (int): A Python integer used to create random seeds. If this parameter is set to None, a random seed is used. @@ -1128,30 +1158,39 @@ def dropout(x, import paddle import paddle.fluid as fluid - + paddle.enable_static() x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") dropped = fluid.layers.dropout(x, dropout_prob=0.5) """ if not isinstance(dropout_prob, (float, int, Variable)): raise TypeError( - "dropout_prob argument should be a number(int|float) or Variable") + "dropout_prob argument should be a number(int|float) or Variable" + ) # fast return for p == 0 if isinstance(dropout_prob, (int, float)) and dropout_prob == 0: return x if _non_static_mode(): - if (seed is None - or seed == 0) and default_main_program().random_seed != 0: + if ( + seed is None or seed == 0 + ) and default_main_program().random_seed != 0: seed = default_main_program().random_seed if is_test is None: is_test = not _dygraph_tracer()._train_mode - out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', dropout_prob, - 'is_test', is_test, 'fix_seed', seed - is not None, 'seed', - seed if seed is not None else 0, - 'dropout_implementation', - dropout_implementation) + out, mask = _legacy_C_ops.dropout( + x, + 'dropout_prob', + dropout_prob, + 'is_test', + is_test, + 'fix_seed', + seed is not None, + 'seed', + seed if seed is not None else 0, + 'dropout_implementation', + dropout_implementation, + ) return out def get_attrs(prog, dropout_prob, is_test, seed): @@ -1159,8 +1198,10 @@ def dropout(x, seed = prog.random_seed if isinstance(dropout_prob, Variable) and not dropout_prob.shape != [1]: raise TypeError( - "Required dropout_prob.shape == [1] if type(dropout_prob) is Variable, but received dropout_prob.shape = {}" - .format(dropout_prob.shape)) + "Required dropout_prob.shape == [1] if type(dropout_prob) is Variable, but received dropout_prob.shape = {}".format( + dropout_prob.shape + ) + ) attrs = { 'dropout_prob': dropout_prob, 'is_test': is_test, @@ -1171,32 +1212,35 @@ def dropout(x, return attrs helper = LayerHelper('dropout', **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'dropout') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'dropout' + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) mask = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True + ) attrs = get_attrs(helper.main_program, dropout_prob, is_test, seed) - helper.append_op(type='dropout', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'Mask': [mask] - }, - attrs=attrs) + helper.append_op( + type='dropout', + inputs={'X': [x]}, + outputs={'Out': [out], 'Mask': [mask]}, + attrs=attrs, + ) return out @templatedoc() -def 
chunk_eval(input, - label, - chunk_scheme, - num_chunk_types, - excluded_chunk_types=None, - seq_length=None): +def chunk_eval( + input, + label, + chunk_scheme, + num_chunk_types, + excluded_chunk_types=None, + seq_length=None, +): r""" This operator computes the precision, recall and F1-score for chunk detection. It is often used in sequence tagging tasks, such as Named Entity Recognition(NER). @@ -1315,30 +1359,39 @@ def chunk_eval(input, num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64") num_label_chunks = helper.create_variable_for_type_inference(dtype="int64") num_correct_chunks = helper.create_variable_for_type_inference( - dtype="int64") + dtype="int64" + ) this_input = {"Inference": [input], "Label": [label]} if seq_length is not None: this_input["SeqLength"] = [seq_length] - helper.append_op(type="chunk_eval", - inputs=this_input, - outputs={ - "Precision": [precision], - "Recall": [recall], - "F1-Score": [f1_score], - "NumInferChunks": [num_infer_chunks], - "NumLabelChunks": [num_label_chunks], - "NumCorrectChunks": [num_correct_chunks] - }, - attrs={ - "num_chunk_types": num_chunk_types, - "chunk_scheme": chunk_scheme, - "excluded_chunk_types": excluded_chunk_types or [] - }) - return (precision, recall, f1_score, num_infer_chunks, num_label_chunks, - num_correct_chunks) + helper.append_op( + type="chunk_eval", + inputs=this_input, + outputs={ + "Precision": [precision], + "Recall": [recall], + "F1-Score": [f1_score], + "NumInferChunks": [num_infer_chunks], + "NumLabelChunks": [num_label_chunks], + "NumCorrectChunks": [num_correct_chunks], + }, + attrs={ + "num_chunk_types": num_chunk_types, + "chunk_scheme": chunk_scheme, + "excluded_chunk_types": excluded_chunk_types or [], + }, + ) + return ( + precision, + recall, + f1_score, + num_infer_chunks, + num_label_chunks, + num_correct_chunks, + ) @deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax") @@ -1459,38 +1512,44 @@ def softmax(input, use_cudnn=True, name=None, axis=-1): return _C_ops.softmax(input, axis) if _non_static_mode(): - return _legacy_C_ops.softmax(input, 'axis', axis, 'use_cudnn', - use_cudnn) + return _legacy_C_ops.softmax( + input, 'axis', axis, 'use_cudnn', use_cudnn + ) inputs = {"X": [input]} attrs = {"axis": axis, "use_cudnn": use_cudnn} helper = LayerHelper('softmax', **locals()) - check_variable_and_dtype(input, 'input/x', - ['float16', 'float32', 'float64'], 'softmax') + check_variable_and_dtype( + input, 'input/x', ['float16', 'float32', 'float64'], 'softmax' + ) dtype = helper.input_dtype() softmax_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="softmax", - inputs={"X": input}, - outputs={"Out": softmax_out}, - attrs=attrs) + helper.append_op( + type="softmax", + inputs={"X": input}, + outputs={"Out": softmax_out}, + attrs=attrs, + ) return softmax_out -def conv2d(input, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format="NCHW"): +def conv2d( + input, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format="NCHW", +): r""" :api_attr: Static Graph @@ -1626,27 +1685,34 @@ def conv2d(input, print(conv2d.shape) # [-1, 2, 30, 30] """ - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'conv2d') + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 
'float64'], 'conv2d' + ) if len(input.shape) != 4: - raise ValueError("Input size should be 4, " - "but received {}".format(len(input.shape))) + raise ValueError( + "Input size should be 4, " + "but received {}".format(len(input.shape)) + ) num_channels = input.shape[1] if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn)) + raise ValueError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." % str(data_format) + ) - channel_last = (data_format == "NHWC") + channel_last = data_format == "NHWC" num_channels = input.shape[3] if channel_last else input.shape[1] if num_channels < 0: raise ValueError( "The channel dimmention of the input(%s) should be defined. " - "Received: %s." % (str(input.shape), str(num_channels))) + "Received: %s." % (str(input.shape), str(num_channels)) + ) assert param_attr is not False, "param_attr should not be False here." if groups is None: @@ -1654,27 +1720,35 @@ def conv2d(input, elif groups <= 0: raise ValueError( "the groups of input must be greater than 0, " - "but received the groups of input is {}".format(groups)) + "but received the groups of input is {}".format(groups) + ) else: if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, input.shape, groups)) + ", the groups is {}".format(num_channels, input.shape, groups) + ) num_filter_channels = num_channels // groups l_type = 'conv2d' - if (num_channels == groups and num_filters % num_channels == 0 - and not use_cudnn): + if ( + num_channels == groups + and num_filters % num_channels == 0 + and not use_cudnn + ): l_type = 'depthwise_conv2d' - if (num_channels == groups and num_filters % num_channels == 0 - and core.is_compiled_with_rocm()): + if ( + num_channels == groups + and num_filters % num_channels == 0 + and core.is_compiled_with_rocm() + ): l_type = 'depthwise_conv2d' # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" if core.is_compiled_with_npu(): - if (num_channels == groups and num_channels == num_filters): + if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: l_type = 'conv2d' @@ -1688,7 +1762,6 @@ def conv2d(input, # padding def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -1699,14 +1772,16 @@ def conv2d(input, if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:4] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): if not (padding[0] == [0, 0] and padding[3] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." 
% str(padding) + ) padding = padding[1:3] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 4, 'padding') @@ -1723,8 +1798,9 @@ def conv2d(input, padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." + % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0] @@ -1742,39 +1818,47 @@ def conv2d(input, raise ValueError( "Invalid filter number, excepted number is larger than 0, but" " received {}, please check the input shape and " - "filter size.".format(filter_elem_num)) - std = (2.0 / filter_elem_num)**0.5 + "filter size.".format(filter_elem_num) + ) + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - default_initializer=_get_default_param_initializer()) + default_initializer=_get_default_param_initializer(), + ) pre_bias = helper.create_variable_for_type_inference(dtype) - if (core.is_compiled_with_cuda() and paddle.fluid.get_flags( - "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + if ( + core.is_compiled_with_cuda() + and paddle.fluid.get_flags("FLAGS_conv2d_disable_cudnn")[ + "FLAGS_conv2d_disable_cudnn" + ] + ): use_cudnn = False - helper.append_op(type=l_type, - inputs={ - 'Input': input, - 'Filter': filter_param, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'use_mkldnn': False, - 'fuse_relu_before_depthwise_conv': False, - "padding_algorithm": padding_algorithm, - "data_format": data_format, - }) + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + 'fuse_relu_before_depthwise_conv': False, + "padding_algorithm": padding_algorithm, + "data_format": data_format, + }, + ) if data_format == 'NCHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -1784,19 +1868,21 @@ def conv2d(input, return helper.append_activation(pre_act) -def conv3d(input, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format="NCDHW"): +def conv3d( + input, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format="NCDHW", +): r""" :api_attr: Static Graph @@ -1939,37 +2025,46 @@ def conv3d(input, dtype = helper.input_dtype() if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn)) + raise ValueError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." 
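# Hedged sketch of conv3d with its NCDHW default data_format; the 5-D input shape
# below is an assumption chosen only to show the size arithmetic.
import paddle
paddle.enable_static()
vol = paddle.static.data(name='vol', shape=[None, 3, 12, 32, 32], dtype='float32')
out = paddle.static.nn.conv3d(input=vol, num_filters=2, filter_size=3, act="relu")
print(out.shape)  # (-1, 2, 10, 30, 30): each spatial dim shrinks by filter_size - 1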
% str(data_format) + ) - channel_last = (data_format == "NDHWC") + channel_last = data_format == "NDHWC" if len(input.shape) != 5: raise ValueError( - "Input should be 5D tensor, but received input with the shape of {}" - .format(input.shape)) + "Input should be 5D tensor, but received input with the shape of {}".format( + input.shape + ) + ) num_channels = input.shape[4] if channel_last else input.shape[1] if num_channels < 0: raise ValueError( "The channel dimmention of the input(%s) should be defined. " - "Received: %s." % (str(input.shape), str(num_channels))) + "Received: %s." % (str(input.shape), str(num_channels)) + ) if groups is None: num_filter_channels = num_channels elif groups <= 0: raise ValueError( - "the groups of conv3d should be greater than 0. Received groups: {}" - .format(groups)) + "the groups of conv3d should be greater than 0. Received groups: {}".format( + groups + ) + ) else: if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " - "Received: number of channels(%s), groups(%s)." % - (str(num_channels), str(groups))) + "Received: number of channels(%s), groups(%s)." + % (str(num_channels), str(groups)) + ) num_filter_channels = num_channels // groups filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') @@ -1977,7 +2072,6 @@ def conv3d(input, dilation = utils.convert_to_list(dilation, 3, 'dilation') def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -1988,14 +2082,16 @@ def conv3d(input, if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:5] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): if not (padding[0] == [0, 0] and padding[4] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:4] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 6, 'padding') @@ -2015,8 +2111,9 @@ def conv3d(input, padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." 
+ % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0, 0] @@ -2030,41 +2127,46 @@ def conv3d(input, filter_shape = [num_filters, num_filter_channels] + filter_size def _get_default_param_initializer(): - filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ - 2] * num_channels + filter_elem_num = ( + filter_size[0] * filter_size[1] * filter_size[2] * num_channels + ) if filter_elem_num <= 0: raise ValueError( "Invalid filter number, excepted number is larger than 0, but" " received {}, please check the input shape and " - "filter size.".format(filter_elem_num)) + "filter size.".format(filter_elem_num) + ) - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - default_initializer=_get_default_param_initializer()) + default_initializer=_get_default_param_initializer(), + ) pre_bias = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=l_type, - inputs={ - 'Input': input, - 'Filter': filter_param, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'use_mkldnn': False, - "padding_algorithm": padding_algorithm, - "data_format": data_format, - }) + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + "padding_algorithm": padding_algorithm, + "data_format": data_format, + }, + ) if data_format == 'NCDHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -2075,17 +2177,19 @@ def conv3d(input, @templatedoc() -def pool2d(input, - pool_size=-1, - pool_type="max", - pool_stride=1, - pool_padding=0, - global_pooling=False, - use_cudnn=True, - ceil_mode=False, - name=None, - exclusive=True, - data_format="NCHW"): +def pool2d( + input, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + name=None, + exclusive=True, + data_format="NCHW", +): """ ${comment} @@ -2196,27 +2300,31 @@ def pool2d(input, if pool_type not in ["max", "avg"]: raise ValueError( "Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if global_pooling is False and pool_size == -1: raise ValueError( "When Attr(global_pooling) is False, Attr(pool_size) must be passed " - "and be a valid value. Received pool_size: %s." % str(pool_size)) + "and be a valid value. Received pool_size: %s." % str(pool_size) + ) if not isinstance(use_cudnn, bool): - raise TypeError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s." % str(use_cudnn)) + raise TypeError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s." % str(use_cudnn) + ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." 
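# Illustrative call to pool2d as declared above (fluid.layers namespace assumed
# from the surrounding code); the input shape is an assumption.
import paddle
import paddle.fluid as fluid
paddle.enable_static()
x = paddle.static.data(name='x', shape=[None, 3, 32, 32], dtype='float32')
y = fluid.layers.pool2d(input=x, pool_size=2, pool_type='max', pool_stride=2)
print(y.shape)  # (-1, 3, 16, 16): 2x2 max pooling with stride 2 halves H and W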
% str(data_format) + ) pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') def update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -2227,14 +2335,16 @@ def pool2d(input, if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:4] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): if not (padding[0] == [0, 0] and padding[3] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:3] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 4, 'padding') @@ -2252,61 +2362,77 @@ def pool2d(input, if pool_padding not in ["SAME", "VALID"]: raise ValueError( "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(pool_padding)) + % str(pool_padding) + ) if pool_padding == "VALID": padding_algorithm = "VALID" pool_padding = [0, 0] if ceil_mode != False: raise ValueError( "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") + "Received ceil_mode: True." + ) elif pool_padding == "SAME": padding_algorithm = "SAME" pool_padding = [0, 0] pool_padding = update_padding(pool_padding, data_format) if in_dygraph_mode(): - return _C_ops.pool2d(input, pool_size, pool_stride, pool_padding, - ceil_mode, exclusive, data_format, pool_type, - global_pooling, False, padding_algorithm, - use_cudnn) + return _C_ops.pool2d( + input, + pool_size, + pool_stride, + pool_padding, + ceil_mode, + exclusive, + data_format, + pool_type, + global_pooling, + False, + padding_algorithm, + use_cudnn, + ) op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={"X": input}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "global_pooling": global_pooling, - "strides": pool_stride, - "paddings": pool_padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": use_cudnn, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": use_cudnn, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) return pool_out @templatedoc() -def pool3d(input, - pool_size=-1, - pool_type="max", - pool_stride=1, - pool_padding=0, - global_pooling=False, - use_cudnn=True, - ceil_mode=False, - name=None, - exclusive=True, - data_format="NCDHW"): +def pool3d( + input, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + name=None, + exclusive=True, + data_format="NCDHW", +): """ ${comment} @@ -2423,28 +2549,32 @@ def pool3d(input, if pool_type not in 
["max", "avg"]: raise ValueError( "Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if global_pooling is False and pool_size == -1: raise ValueError( "When Attr(global_pooling) is False, Attr(pool_size) must be passed " - "and be a valid value. Received Attr(pool_size): %s." % - str(pool_size)) + "and be a valid value. Received Attr(pool_size): %s." + % str(pool_size) + ) if not isinstance(use_cudnn, bool): - raise TypeError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn)) + raise TypeError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) + "Attr(data_format): %s" % str(data_format) + ) pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride') def update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, (list, tuple)): return True @@ -2455,14 +2585,16 @@ def pool3d(input, if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:5] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): if not (padding[0] == [0, 0] and padding[4] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:4] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 6, 'padding') @@ -2484,14 +2616,16 @@ def pool3d(input, if pool_padding not in ["SAME", "VALID"]: raise ValueError( "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(pool_padding)) + % str(pool_padding) + ) if pool_padding == "VALID": padding_algorithm = "VALID" pool_padding = [0, 0, 0] if ceil_mode != False: raise ValueError( "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " - "Received ceil_mode: True.") + "Received ceil_mode: True." 
+ ) elif pool_padding == "SAME": padding_algorithm = "SAME" pool_padding = [0, 0, 0] @@ -2503,33 +2637,33 @@ def pool3d(input, dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={"X": input}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "global_pooling": global_pooling, - "strides": pool_stride, - "paddings": pool_padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": use_cudnn, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": use_cudnn, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) return pool_out @deprecated(since="2.0.0") @templatedoc(op_type="pool2d") -def adaptive_pool2d(input, - pool_size, - pool_type="max", - require_index=False, - name=None): +def adaptive_pool2d( + input, pool_size, pool_type="max", require_index=False, name=None +): r""" This operation calculates the output based on the input, pool_size, @@ -2626,19 +2760,24 @@ def adaptive_pool2d(input, pool_type='max') """ check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'adaptive_pool2d') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'adaptive_pool2d', + ) check_type(pool_type, 'pool_type', str, 'adaptive_pool2d') check_type(pool_size, 'pool_size', (int, list, tuple), 'adaptive_pool2d') check_type(require_index, 'require_index', bool, 'adaptive_pool2d') if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if pool_type == "avg" and require_index: raise ValueError( - "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + "invalid setting 'require_index' true when 'pool_type' is 'avg'." 
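# Sketch of adaptive_pool2d (deprecated since 2.0.0 per the decorator above): the
# output spatial size equals pool_size regardless of input size; shapes assumed.
import paddle
import paddle.fluid as fluid
paddle.enable_static()
x = paddle.static.data(name='x', shape=[None, 3, 32, 32], dtype='float32')
y = fluid.layers.adaptive_pool2d(input=x, pool_size=[3, 3], pool_type='max')
print(y.shape)  # (-1, 3, 3, 3)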
+ ) pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') @@ -2656,25 +2795,25 @@ def adaptive_pool2d(input, mask = helper.create_variable_for_type_inference(dtype) outputs["Mask"] = mask - helper.append_op(type=l_type, - inputs={"X": input}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }, + ) return (pool_out, mask) if require_index else pool_out @deprecated(since="2.0.0") @templatedoc(op_type="pool3d") -def adaptive_pool3d(input, - pool_size, - pool_type="max", - require_index=False, - name=None): +def adaptive_pool3d( + input, pool_size, pool_type="max", require_index=False, name=None +): r""" This operation calculates the output based on the input, pool_size, @@ -2785,19 +2924,24 @@ def adaptive_pool3d(input, pool_type='max') """ check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'adaptive_pool3d') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'adaptive_pool3d', + ) check_type(pool_type, 'pool_type', str, 'adaptive_pool3d') check_type(pool_size, 'pool_size', (int, list, tuple), 'adaptive_pool3d') check_type(require_index, 'require_index', bool, 'adaptive_pool3d') if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if pool_type == "avg" and require_index: raise ValueError( - "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + "invalid setting 'require_index' true when 'pool_type' is 'avg'." + ) pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') @@ -2815,32 +2959,36 @@ def adaptive_pool3d(input, mask = helper.create_variable_for_type_inference(dtype) outputs["Mask"] = mask - helper.append_op(type=l_type, - inputs={"X": input}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }, + ) return (pool_out, mask) if require_index else pool_out -def batch_norm(input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - in_place=False, - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False): +def batch_norm( + input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, +): r""" :api_attr: Static Graph @@ -2955,11 +3103,14 @@ def batch_norm(input, print(hidden2.shape) # [3, 200] """ - assert bias_attr is not False, "bias_attr should not be False in batch_norm." + assert ( + bias_attr is not False + ), "bias_attr should not be False in batch_norm." 
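# Minimal usage sketch of batch_norm in static mode, mirroring the docstring
# fragment above ("# [3, 200]"); the fc layer and shapes are assumptions.
import paddle
paddle.enable_static()
x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32')
hidden1 = paddle.static.nn.fc(x=x, size=200)       # [3, 200]
hidden2 = paddle.static.nn.batch_norm(input=hidden1)
print(hidden2.shape)  # [3, 200]: batch_norm keeps the input shape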
helper = LayerHelper('batch_norm', **locals()) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'batch_norm') + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'batch_norm' + ) dtype = helper.input_dtype() # use fp32 for bn parameter @@ -2978,31 +3129,38 @@ def batch_norm(input, param_shape = [channel_num] # create parameter - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) - - mean = helper.create_parameter(attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) + + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) mean.stop_gradient = True - variance = helper.create_parameter(attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) variance.stop_gradient = True # create output @@ -3022,38 +3180,81 @@ def batch_norm(input, attrs_ = () if attrs_has_momentum: - attrs_ = ('momentum', momentum, 'epsilon', epsilon, 'is_test', - is_test, 'data_layout', data_layout, 'use_mkldnn', False, - 'fuse_with_relu', False, 'use_global_stats', - use_global_stats) + attrs_ = ( + 'momentum', + momentum, + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + ) else: - attrs_ = ('epsilon', epsilon, 'is_test', is_test, 'data_layout', - data_layout, 'use_mkldnn', False, 'fuse_with_relu', False, - 'use_global_stats', use_global_stats) + attrs_ = ( + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + ) if inputs_has_MomemtumTensor: batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, scale, bias, mean, variance, momentum, mean_out, - variance_out, *attrs_) + input, + scale, + bias, + mean, + variance, + momentum, + mean_out, + variance_out, + *attrs_, + ) else: batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, scale, bias, mean, variance, None, mean_out, - variance_out, *attrs_) + input, + scale, + bias, + mean, + variance, + None, + mean_out, + variance_out, + *attrs_, + ) - return dygraph_utils._append_activation_in_dygraph(batch_norm_out, - act=act, - use_mkldnn=False) + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=act, use_mkldnn=False + ) - saved_mean = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + saved_mean = 
helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) reserve_space = None if not is_test: reserve_space = helper.create_variable_for_type_inference( - dtype=helper.input_dtype(), stop_gradient=True) + dtype=helper.input_dtype(), stop_gradient=True + ) - batch_norm_out = input if in_place else \ - helper.create_variable_for_type_inference(dtype) + batch_norm_out = ( + input if in_place else helper.create_variable_for_type_inference(dtype) + ) inputs = { "X": input, @@ -3062,7 +3263,7 @@ def batch_norm(input, "Mean": mean, "Variance": variance, "MeanOut": mean_out, - "VarianceOut": variance_out + "VarianceOut": variance_out, } attrs = { "epsilon": epsilon, @@ -3070,7 +3271,7 @@ def batch_norm(input, "data_layout": data_layout, "use_mkldnn": False, "fuse_with_relu": False, - "use_global_stats": use_global_stats + "use_global_stats": use_global_stats, } if isinstance(momentum, Variable): inputs['MomemtumTensor'] = momentum @@ -3082,33 +3283,34 @@ def batch_norm(input, "MeanOut": mean_out, "VarianceOut": variance_out, "SavedMean": saved_mean, - "SavedVariance": saved_variance + "SavedVariance": saved_variance, } if reserve_space is not None: outputs["ReserveSpace"] = reserve_space - helper.append_op(type="batch_norm", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) return helper.append_activation(batch_norm_out) -def inplace_abn(input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - act_alpha=1.0): +def inplace_abn( + input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW', + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, + act_alpha=1.0, +): r""" **In-place Activation Batch Normalization Layer** @@ -3142,14 +3344,14 @@ def inplace_abn(input, numerical stability. Default is 1e-5. param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn - will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized - with Xavier. Default: None. + will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. + If the Initializer of the param_attr is not set, the parameter is initialized + with Xavier. Default: None. bias_attr(ParamAttr|None): The parameter attribute for the bias of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. - Default: None. + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. data_layout (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. 
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -3187,14 +3389,18 @@ def inplace_abn(input, hidden3 = fluid.layers.inplace_abn(input=hidden2, act='leaky_relu', act_alpha=0.2) """ - assert act in [None, 'identity', 'leaky_relu', 'elu'], \ - "inplace_abn only support act as None, 'identity', " \ + assert act in [None, 'identity', 'leaky_relu', 'elu'], ( + "inplace_abn only support act as None, 'identity', " "'leaky_relu', 'elu' currently" - assert bias_attr is not False, "bias_attr should not be False in inplace_abn." + ) + assert ( + bias_attr is not False + ), "bias_attr should not be False in inplace_abn." helper = LayerHelper('inplace_abn', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'inplace_abn') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'inplace_abn' + ) dtype = helper.input_dtype() input_shape = input.shape @@ -3209,31 +3415,38 @@ def inplace_abn(input, param_shape = [channel_num] # create parameter - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) - - mean = helper.create_parameter(attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) + + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) mean.stop_gradient = True - variance = helper.create_parameter(attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) variance.stop_gradient = True # create output @@ -3255,39 +3468,88 @@ def inplace_abn(input, attrs__ = () if attrs_has_momentum: - attrs__ = ('momentum', momentum, 'epsilon', epsilon, 'is_test', - is_test, 'data_layout', data_layout, 'use_mkldnn', False, - 'fuse_with_relu', False, 'use_global_stats', - use_global_stats, 'activation', act, 'alpha', act_alpha) + attrs__ = ( + 'momentum', + momentum, + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + 'activation', + act, + 'alpha', + act_alpha, + ) else: - attrs__ = ('epsilon', epsilon, 'is_test', is_test, 'data_layout', - data_layout, 'use_mkldnn', False, 'fuse_with_relu', - False, 'use_global_stats', use_global_stats, - 'activation', act, 'alpha', act_alpha) + attrs__ = ( + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + 'activation', + act, + 'alpha', + act_alpha, + ) if inputs_has_MomemtumTensor: 
batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( - input, scale, bias, mean, variance, momentum, mean_out, - variance_out, *attrs__) + input, + scale, + bias, + mean, + variance, + momentum, + mean_out, + variance_out, + *attrs__, + ) return batch_norm_out else: batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( - input, scale, bias, mean, variance, None, mean_out, - variance_out, *attrs__) + input, + scale, + bias, + mean, + variance, + None, + mean_out, + variance_out, + *attrs__, + ) return batch_norm_out - saved_mean = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) reserve_space = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) inputs = { "X": input, "Scale": scale, "Bias": bias, "Mean": mean, - "Variance": variance + "Variance": variance, } attrs = { "epsilon": epsilon, @@ -3308,24 +3570,21 @@ def inplace_abn(input, "MeanOut": mean_out, "VarianceOut": variance_out, "SavedMean": saved_mean, - "SavedVariance": saved_variance + "SavedVariance": saved_variance, } if reserve_space is not None: outputs["ReserveSpace"] = reserve_space - helper.append_op(type="inplace_abn", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="inplace_abn", inputs=inputs, outputs=outputs, attrs=attrs + ) return batch_norm_out -def instance_norm(input, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - name=None): +def instance_norm( + input, epsilon=1e-05, param_attr=None, bias_attr=None, name=None +): r""" :api_attr: Static Graph @@ -3389,10 +3648,13 @@ def instance_norm(input, hidden1 = paddle.static.nn.fc(x, size=200) hidden2 = paddle.static.nn.instance_norm(hidden1) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'instance_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'instance_norm' + ) if param_attr is False: - assert bias_attr is False, "param_attr and bias_attr must be set to Fasle at the same time in instance_norm" + assert ( + bias_attr is False + ), "param_attr and bias_attr must be set to Fasle at the same time in instance_norm" helper = LayerHelper('instance_norm', **locals()) dtype = helper.input_dtype() @@ -3404,29 +3666,37 @@ def instance_norm(input, input_shape = input.shape if len(input.shape) < 2 or len(input.shape) > 5: raise ValueError( - 'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})' - .format(len(input.shape), input_shape)) + 'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})'.format( + len(input.shape), input_shape + ) + ) channel_num = input_shape[1] param_shape = [channel_num] if param_attr != False and bias_attr != False: # create parameter - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True, - default_initializer=Constant(0.0)) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=param_shape, + dtype=dtype, + is_bias=True, + default_initializer=Constant(0.0), + ) # create 
output - saved_mean = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) instance_norm_out = helper.create_variable_for_type_inference(dtype) @@ -3435,35 +3705,39 @@ def instance_norm(input, inputs["Scale"] = scale inputs["Bias"] = bias - helper.append_op(type="instance_norm", - inputs=inputs, - outputs={ - "Y": instance_norm_out, - "SavedMean": saved_mean, - "SavedVariance": saved_variance - }, - attrs={ - "epsilon": epsilon, - }) + helper.append_op( + type="instance_norm", + inputs=inputs, + outputs={ + "Y": instance_norm_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + }, + attrs={ + "epsilon": epsilon, + }, + ) return instance_norm_out @static_only -def data_norm(input, - act=None, - epsilon=1e-05, - param_attr=None, - data_layout='NCHW', - in_place=False, - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - slot_dim=-1, - sync_stats=False, - summary_decay_rate=0.9999999, - enable_scale_and_shift=False): +def data_norm( + input, + act=None, + epsilon=1e-05, + param_attr=None, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + slot_dim=-1, + sync_stats=False, + summary_decay_rate=0.9999999, + enable_scale_and_shift=False, +): r""" :api_attr: Static Graph @@ -3561,39 +3835,54 @@ def data_norm(input, if name == None: name = "dn" if enable_scale_and_shift: - scale_w = helper.create_parameter(attr=ParamAttr( - name=name + '.scale_w', - initializer=Constant(value=float(scale_w_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) - bias = helper.create_parameter(attr=ParamAttr( - name=name + '.bias', - initializer=Constant(value=float(bias_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) + scale_w = helper.create_parameter( + attr=ParamAttr( + name=name + '.scale_w', + initializer=Constant(value=float(scale_w_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) + bias = helper.create_parameter( + attr=ParamAttr( + name=name + '.bias', + initializer=Constant(value=float(bias_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) # create parameter - batch_size = helper.create_parameter(attr=ParamAttr( - name=name + '.batch_size', - initializer=Constant(value=float(batch_size_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) - - batch_sum = helper.create_parameter(attr=ParamAttr( - name=name + '.batch_sum', - initializer=Constant(value=float(batch_sum_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) - - batch_square_sum = helper.create_parameter(attr=ParamAttr( - name=name + '.batch_square_sum', - initializer=Constant(value=float(batch_square_sum_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) + batch_size = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_size', + initializer=Constant(value=float(batch_size_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) + + batch_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_sum', + initializer=Constant(value=float(batch_sum_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) 
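# Usage sketch for instance_norm, following the docstring fragment above; the
# fc layer and input shape are assumptions used only to show shape preservation.
import paddle
paddle.enable_static()
x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32')
hidden1 = paddle.static.nn.fc(x=x, size=200)
hidden2 = paddle.static.nn.instance_norm(hidden1)
print(hidden2.shape)  # [3, 200]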
+ + batch_square_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_square_sum', + initializer=Constant(value=float(batch_square_sum_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) means = helper.create_variable(dtype=dtype, stop_gradient=True) scales = helper.create_variable(dtype=dtype, stop_gradient=True) @@ -3604,7 +3893,7 @@ def data_norm(input, "X": input, "BatchSize": batch_size, "BatchSum": batch_sum, - "BatchSquareSum": batch_square_sum + "BatchSquareSum": batch_square_sum, } attrs = { "epsilon": epsilon, @@ -3619,31 +3908,35 @@ def data_norm(input, if enable_scale_and_shift: inputs["scale_w"] = scale_w inputs["bias"] = bias - helper.append_op(type="data_norm", - inputs=inputs, - outputs={ - "Y": data_norm_out, - "Means": means, - "Scales": scales, - "BatchSize": batch_size, - "BatchSum": batch_sum, - "BatchSquareSum": batch_square_sum - }, - attrs=attrs) + helper.append_op( + type="data_norm", + inputs=inputs, + outputs={ + "Y": data_norm_out, + "Means": means, + "Scales": scales, + "BatchSize": batch_size, + "BatchSum": batch_sum, + "BatchSquareSum": batch_square_sum, + }, + attrs=attrs, + ) return helper.append_activation(data_norm_out) @templatedoc() -def layer_norm(input, - scale=True, - shift=True, - begin_norm_axis=1, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None, - name=None): +def layer_norm( + input, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + name=None, +): r""" :api_attr: Static Graph @@ -3706,11 +3999,13 @@ def layer_norm(input, output = paddle.static.nn.layer_norm(input=x, begin_norm_axis=1) print(output.shape) # [8, 32, 32] """ - assert _non_static_mode( - ) is not True, "please use LayerNorm instead of layer_norm in dygraph mode!" + assert ( + _non_static_mode() is not True + ), "please use LayerNorm instead of layer_norm in dygraph mode!" helper = LayerHelper('layer_norm', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'layer_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'layer_norm' + ) dtype = helper.input_dtype() # create intput and parameters @@ -3718,57 +4013,65 @@ def layer_norm(input, input_shape = input.shape param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])] if scale: - assert param_attr is not False, "param_attr should not be False when using scale." - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) + assert ( + param_attr is not False + ), "param_attr should not be False when using scale." + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) inputs['Scale'] = scale else: if param_attr: warnings.warn("param_attr is only available with scale is True.") if shift: - assert bias_attr is not False, "bias_attr should not be False when using shift." - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) + assert ( + bias_attr is not False + ), "bias_attr should not be False when using shift." 
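# Minimal sketch matching the layer_norm docstring fragment above
# ("# [8, 32, 32]"); begin_norm_axis=1 normalizes over all but the batch dim.
import paddle
paddle.enable_static()
x = paddle.static.data(name='x', shape=[8, 32, 32], dtype='float32')
out = paddle.static.nn.layer_norm(input=x, begin_norm_axis=1)
print(out.shape)  # [8, 32, 32]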
+ bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) inputs['Bias'] = bias else: if bias_attr: warnings.warn("bias_attr is only available with shift is True.") # create output - mean_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) - variance_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) layer_norm_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="layer_norm", - inputs=inputs, - outputs={ - "Y": layer_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": epsilon, - "begin_norm_axis": begin_norm_axis - }) + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}, + ) return helper.append_activation(layer_norm_out) @templatedoc() -def group_norm(input, - groups, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None, - data_layout='NCHW', - name=None): +def group_norm( + input, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW', + name=None, +): """ :api_attr: Static Graph @@ -3806,15 +4109,16 @@ def group_norm(input, import paddle paddle.enable_static() - + data = paddle.static.data(name='data', shape=[2, 8, 32, 32], dtype='float32') x = paddle.static.nn.group_norm(input=data, groups=4) print(x.shape) # [2, 8, 32, 32] """ helper = LayerHelper('group_norm', **locals()) dtype = helper.input_dtype() - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'group_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'group_norm' + ) # create intput and parameters inputs = {'X': input} input_shape = input.shape @@ -3825,20 +4129,23 @@ def group_norm(input, if data_layout != 'NCHW' and data_layout != 'NHWC': raise ValueError( "Param(data_layout) of Op(fluid.layers.group_norm) got wrong value: received " - + data_layout + " but only NCHW or NHWC supported.") + + data_layout + + " but only NCHW or NHWC supported." 
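# Sketch of group_norm as in the docstring fragment above: 8 channels are split
# into groups of 2 when groups=4; shapes match the "# [2, 8, 32, 32]" comment.
import paddle
paddle.enable_static()
data = paddle.static.data(name='data', shape=[2, 8, 32, 32], dtype='float32')
x = paddle.static.nn.group_norm(input=data, groups=4)
print(x.shape)  # [2, 8, 32, 32]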
+ ) channel_num = input_shape[1] if data_layout == 'NCHW' else input_shape[-1] param_shape = [channel_num] if param_attr: - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) inputs['Scale'] = scale if bias_attr: - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) inputs['Bias'] = bias # create output @@ -3846,18 +4153,20 @@ def group_norm(input, variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) group_norm_out = helper.create_variable(dtype=dtype) - helper.append_op(type="group_norm", - inputs=inputs, - outputs={ - "Y": group_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": epsilon, - "groups": groups, - "data_layout": data_layout - }) + helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={ + "epsilon": epsilon, + "groups": groups, + "data_layout": data_layout, + }, + ) return helper.append_activation(group_norm_out) @@ -3926,8 +4235,9 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): print(x.shape) # [2, 8, 32, 32] """ helper = LayerHelper('spectral_norm', **locals()) - check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], - 'spectral_norm') + check_variable_and_dtype( + weight, 'weight', ['float32', 'float64'], 'spectral_norm' + ) check_type(dim, 'dim', int, 'spectral_norm') check_type(power_iters, 'power_iters', int, 'spectral_norm') check_type(eps, 'eps', float, 'spectral_norm') @@ -3936,21 +4246,27 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): # create intput and parameters input_shape = weight.shape assert weight.numel() > 0, "Any dimension of input cannot be equal to 0." 
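# Sketch of spectral_norm following the docstring fragment above; dim=1 and
# power_iters=2 are illustrative choices, and the output keeps the weight shape.
import paddle
paddle.enable_static()
weight = paddle.static.data(name='weight', shape=[2, 8, 32, 32], dtype='float32')
x = paddle.static.nn.spectral_norm(weight=weight, dim=1, power_iters=2)
print(x.shape)  # [2, 8, 32, 32]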
- assert dim < len(input_shape), ("The input `dim` should be less than the " - "rank of `weight`, but received dim=" - "{}".format(dim)) + assert dim < len(input_shape), ( + "The input `dim` should be less than the " + "rank of `weight`, but received dim=" + "{}".format(dim) + ) h = input_shape[dim] w = np.prod(input_shape) // h - u = helper.create_parameter(attr=ParamAttr(), - shape=[h], - dtype=dtype, - default_initializer=Normal(0., 1.)) + u = helper.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=dtype, + default_initializer=Normal(0.0, 1.0), + ) u.stop_gradient = True - v = helper.create_parameter(attr=ParamAttr(), - shape=[w], - dtype=dtype, - default_initializer=Normal(0., 1.)) + v = helper.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=dtype, + default_initializer=Normal(0.0, 1.0), + ) v.stop_gradient = True if in_dygraph_mode(): @@ -3963,34 +4279,38 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): # create output out = helper.create_variable(dtype=dtype) - helper.append_op(type="spectral_norm", - inputs=inputs, - outputs={ - "Out": out, - }, - attrs={ - "dim": dim, - "power_iters": power_iters, - "eps": eps, - }) + helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={ + "Out": out, + }, + attrs={ + "dim": dim, + "power_iters": power_iters, + "eps": eps, + }, + ) return out -def conv2d_transpose(input, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format='NCHW'): +def conv2d_transpose( + input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format='NCHW', +): r""" :api_attr: Static Graph @@ -4146,20 +4466,29 @@ def conv2d_transpose(input, conv2d_transpose = paddle.static.nn.conv2d_transpose(input=data, num_filters=2, filter_size=3) print(conv2d_transpose.shape) # [-1, 2, 34, 34] """ - assert param_attr is not False, "param_attr should not be False in conv2d_transpose." + assert ( + param_attr is not False + ), "param_attr should not be False in conv2d_transpose." if len(input.shape) != 4: - raise ValueError("Input size should be 4, " - "but received {}".format(len(input.shape))) + raise ValueError( + "Input size should be 4, " + "but received {}".format(len(input.shape)) + ) if data_format not in ['NCHW', 'NHWC']: raise ValueError( "Attr(data_format) of Op(fluid.layers.conv2d_transpose) got wrong value: received " - + data_format + " but only NCHW or NHWC supported.") + + data_format + + " but only NCHW or NHWC supported." + ) input_channel = input.shape[1] if data_format == 'NCHW' else input.shape[-1] op_type = 'conv2d_transpose' - if (input_channel == groups and num_filters == input_channel - and not use_cudnn): + if ( + input_channel == groups + and num_filters == input_channel + and not use_cudnn + ): op_type = 'depthwise_conv2d_transpose' helper = LayerHelper(op_type, **locals()) @@ -4173,7 +4502,6 @@ def conv2d_transpose(input, raise ValueError("use_cudnn should be True or False") def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -4184,14 +4512,16 @@ def conv2d_transpose(input, if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." 
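# Usage sketch matching the conv2d_transpose docstring fragment above
# ("# [-1, 2, 34, 34]"); a 3x3 kernel with stride 1 grows each spatial dim by 2.
import paddle
paddle.enable_static()
data = paddle.static.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
out = paddle.static.nn.conv2d_transpose(input=data, num_filters=2, filter_size=3)
print(out.shape)  # (-1, 2, 34, 34)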
% str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:4] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): if not (padding[0] == [0, 0] and padding[3] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:3] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 4, 'padding') @@ -4205,8 +4535,9 @@ def conv2d_transpose(input, padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." + % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0, 0, 0] @@ -4226,44 +4557,63 @@ def conv2d_transpose(input, elif isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') elif isinstance(output_size, Variable): - check_dtype(output_size.dtype, 'output_size', ['int32', 'int64'], - 'conv2d_transpose') - if len(output_size.shape) == 1 and (output_size.shape[0] == 1 - or output_size.shape[0] == 2): + check_dtype( + output_size.dtype, + 'output_size', + ['int32', 'int64'], + 'conv2d_transpose', + ) + if len(output_size.shape) == 1 and ( + output_size.shape[0] == 1 or output_size.shape[0] == 2 + ): if output_size.shape[0] == 1: output_size = [output_size, output_size] else: raise ValueError("output_size must contain one or two integers.") else: raise ValueError( - "output_size should be int, list[int] or tuple[int] or Tensor") + "output_size should be int, list[int] or tuple[int] or Tensor" + ) if filter_size is None: if output_size is []: raise ValueError("output_size must be set when filter_size is None") if not _non_static_mode(): - if isinstance(output_size, - Variable) or utils._contain_var(output_size): + if isinstance(output_size, Variable) or utils._contain_var( + output_size + ): raise ValueError( "filter_size should not be None when output_size is Variable or contain Variable in static mode." 
) else: output_size = utils.convert_shape_to_list(output_size) if len(output_size) == 1: - output_size = utils.convert_to_list(output_size[0], 2, - 'output_size') + output_size = utils.convert_to_list( + output_size[0], 2, 'output_size' + ) h_in = input.shape[2] if data_format == 'NCHW' else input.shape[1] w_in = input.shape[3] if data_format == 'NCHW' else input.shape[2] - filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + padding[0] + - padding[1] - 1) // dilation[0] + 1 - filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + padding[2] + - padding[3] - 1) // dilation[1] + 1 + filter_size_h = ( + output_size[0] + - (h_in - 1) * stride[0] + + padding[0] + + padding[1] + - 1 + ) // dilation[0] + 1 + filter_size_w = ( + output_size[1] + - (w_in - 1) * stride[1] + + padding[2] + + padding[3] + - 1 + ) // dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] else: - filter_size = utils.convert_to_list(filter_size, 2, - 'conv2d_transpose.filter_size') + filter_size = utils.convert_to_list( + filter_size, 2, 'conv2d_transpose.filter_size' + ) if len(padding) == 4 and utils._is_symmetric_padding(padding, 2): padding = [padding[0], padding[2]] @@ -4273,31 +4623,31 @@ def conv2d_transpose(input, elif groups <= 0: raise ValueError( "the groups of input must be greater than 0, " - "but received the groups of input is {}".format(groups)) + "but received the groups of input is {}".format(groups) + ) filter_shape = [input_channel, num_filters // groups] + filter_size - img_filter = helper.create_parameter(dtype=input.dtype, - shape=filter_shape, - attr=helper.param_attr) + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr + ) pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=op_type, - inputs={ - 'Input': [input], - 'Filter': [img_filter] - }, - outputs={'Output': pre_bias}, - attrs={ - 'output_size': output_size, - 'strides': stride, - 'paddings': padding, - 'padding_algorithm': padding_algorithm, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'data_format': data_format - }) + helper.append_op( + type=op_type, + inputs={'Input': [input], 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': data_format, + }, + ) if data_format == 'NCHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -4307,20 +4657,22 @@ def conv2d_transpose(input, return out -def conv3d_transpose(input, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format='NCDHW'): +def conv3d_transpose( + input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format='NCDHW', +): r""" :api_attr: Static Graph @@ -4484,11 +4836,15 @@ def conv3d_transpose(input, output = exe.run(feed={"data": x}, fetch_list=[res]) print(output) """ - assert param_attr is not False, "param_attr should not be False in conv3d_transpose." + assert ( + param_attr is not False + ), "param_attr should not be False in conv3d_transpose." 
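# Hedged sketch of conv3d_transpose with the NCDHW default; the 5-D input shape
# is an assumption, and each spatial dim grows by filter_size - 1 with stride 1.
import paddle
paddle.enable_static()
data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32')
out = paddle.static.nn.conv3d_transpose(input=data, num_filters=2, filter_size=3)
print(out.shape)  # (-1, 2, 14, 34, 34)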
if data_format not in ['NCDHW', 'NDHWC']: raise ValueError( "Param(data_format) of Op(fluid.layers.conv3d_transpose) got wrong value: received " - + data_format + " but only NCDHW or NDHWC supported.") + + data_format + + " but only NCDHW or NDHWC supported." + ) l_type = "conv3d_transpose" helper = LayerHelper(l_type, **locals()) @@ -4496,9 +4852,13 @@ def conv3d_transpose(input, raise TypeError("Input of conv3d_transpose must be Variable") if len(input.shape) != 5: raise ValueError( - "Input should be 5D tensor, but received input with the shape of {}" - .format(input.shape)) - input_channel = input.shape[1] if data_format == 'NCDHW' else input.shape[-1] + "Input should be 5D tensor, but received input with the shape of {}".format( + input.shape + ) + ) + input_channel = ( + input.shape[1] if data_format == 'NCDHW' else input.shape[-1] + ) stride = utils.convert_to_list(stride, 3, 'stride') dilation = utils.convert_to_list(dilation, 3, 'dilation') @@ -4507,7 +4867,6 @@ def conv3d_transpose(input, raise ValueError("use_cudnn should be True or False") def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -4518,14 +4877,16 @@ def conv3d_transpose(input, if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:5] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): if not (padding[0] == [0, 0] and padding[4] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:4] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 6, 'padding') @@ -4536,8 +4897,12 @@ def conv3d_transpose(input, else: padding = utils.convert_to_list(padding, 3, 'padding') padding = [ - padding[0], padding[0], padding[1], padding[1], padding[2], - padding[2] + padding[0], + padding[0], + padding[1], + padding[1], + padding[2], + padding[2], ] return padding @@ -4546,8 +4911,9 @@ def conv3d_transpose(input, padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." 
+ % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0, 0, 0, 0, 0] @@ -4567,16 +4933,32 @@ def conv3d_transpose(input, h_in = input.shape[3] if data_format == 'NCDHW' else input.shape[2] w_in = input.shape[4] if data_format == 'NCDHW' else input.shape[3] - filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + padding[0] + - padding[1] - 1) // dilation[0] + 1 - filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + padding[2] + - padding[3] - 1) // dilation[1] + 1 - filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + padding[4] + - padding[5] - 1) // dilation[2] + 1 + filter_size_d = ( + output_size[0] + - (d_in - 1) * stride[0] + + padding[0] + + padding[1] + - 1 + ) // dilation[0] + 1 + filter_size_h = ( + output_size[1] + - (h_in - 1) * stride[1] + + padding[2] + + padding[3] + - 1 + ) // dilation[1] + 1 + filter_size_w = ( + output_size[2] + - (w_in - 1) * stride[2] + + padding[4] + + padding[5] + - 1 + ) // dilation[2] + 1 filter_size = [filter_size_d, filter_size_h, filter_size_w] else: - filter_size = utils.convert_to_list(filter_size, 3, - 'conv3d_transpose.filter_size') + filter_size = utils.convert_to_list( + filter_size, 3, 'conv3d_transpose.filter_size' + ) if len(padding) == 6 and utils._is_symmetric_padding(padding, 3): padding = [padding[0], padding[2], padding[4]] @@ -4591,18 +4973,22 @@ def conv3d_transpose(input, groups = 1 if groups is None else groups if groups <= 0: raise ValueError( - "the groups of conv3d_transpose should be greater than 0. Received groups: {}" - .format(groups)) + "the groups of conv3d_transpose should be greater than 0. Received groups: {}".format( + groups + ) + ) if num_filters % groups != 0: raise ValueError( "Attr(num_filters) must be divisible by groups," "Received: Attr(num_filters) is {}, the groups is {}".format( - num_filters, groups)) + num_filters, groups + ) + ) filter_shape = [input_channel, num_filters // groups] + filter_size - img_filter = helper.create_parameter(dtype=input.dtype, - shape=filter_shape, - attr=helper.param_attr) + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr + ) if data_format == 'NCDHW': data_format = 'NCHW' @@ -4610,22 +4996,21 @@ def conv3d_transpose(input, data_format = 'NHWC' pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=l_type, - inputs={ - 'Input': [input], - 'Filter': [img_filter] - }, - outputs={'Output': pre_bias}, - attrs={ - 'output_size': output_size, - 'strides': stride, - 'paddings': padding, - 'padding_algorithm': padding_algorithm, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'data_format': data_format - }) + helper.append_op( + type=l_type, + inputs={'Input': [input], 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': data_format, + }, + ) if data_format == 'NCHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -4691,37 +5076,47 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): dim = [dim] if in_dygraph_mode(): - reduce_all = True if dim == None or dim == [] or len(dim) == len( - input.shape) else False + reduce_all = ( + True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False + ) dim = dim if dim != None and dim != [] else [0] if reduce_all: 
return _C_ops.sum(input, [], None, keep_dim) else: return _C_ops.sum(input, dim, None, keep_dim) elif _in_legacy_dygraph(): - reduce_all = True if dim == None or dim == [] or len(dim) == len( - input.shape) else False + reduce_all = ( + True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False + ) dim = dim if dim != None and dim != [] else [0] - return _legacy_C_ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim, - 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_sum( + input, 'dim', dim, 'keep_dim', keep_dim, 'reduce_all', reduce_all + ) attrs = { - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True - if dim == None or dim == [] or len(dim) == len(input.shape) else False + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, } check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'reduce_sum') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'reduce_sum', + ) helper = LayerHelper('reduce_sum', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='reduce_sum', - inputs={'X': input}, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='reduce_sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs=attrs, + ) return out @@ -4839,18 +5234,18 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): if in_dygraph_mode(): return _C_ops.max(input, dim if dim != None else [], keep_dim) - helper.append_op(type='reduce_max', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_max', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -4911,18 +5306,18 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): if in_dygraph_mode(): return _C_ops.min(input, dim if dim != None else [], keep_dim) - helper.append_op(type='reduce_min', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_min', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -4983,30 +5378,37 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): dim = [dim] else: raise TypeError( - "The type of axis must be int, list or tuple, but received {}". 
- format(type(dim))) + "The type of axis must be int, list or tuple, but received {}".format( + type(dim) + ) + ) if in_dygraph_mode(): return _C_ops.reduce_prod( - input, dim if dim != None and dim != [] else [0], keep_dim, True if - dim == None or dim == [] or len(dim) == len(input.shape) else False) + input, + dim if dim != None and dim != [] else [0], + keep_dim, + True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + ) helper = LayerHelper('reduce_prod', **locals()) - check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64'], - 'reduce_prod') + check_variable_and_dtype( + input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod' + ) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='reduce_prod', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_prod', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -5063,18 +5465,18 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): check_variable_and_dtype(input, 'input', ('bool'), 'reduce_all') helper = LayerHelper('reduce_all', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='reduce_all', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_all', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -5126,18 +5528,18 @@ def reduce_any(input, dim=None, keep_dim=False, name=None): out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] - helper.append_op(type='reduce_any', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_any', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -5147,15 +5549,15 @@ def split(input, num_or_sections, dim=-1, name=None): Args: input (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64. - num_or_sections (int|list|tuple): If ``num_or_sections`` is int, then the ``num_or_sections`` + num_or_sections (int|list|tuple): If ``num_or_sections`` is int, then the ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``input`` - will be divided into. If ``num_or_sections`` is a list or tuple, the length of it + will be divided into. 
If ``num_or_sections`` is a list or tuple, the length of it indicates the number of sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly. The length of the list mustn't be larger than the ``input`` 's size of specified dim. dim (int|Tensor, optional): The dimension along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Default is -1. - name (str, optional): The default value is None. Normally there is no need for user to set this property. + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -5184,7 +5586,7 @@ def split(input, num_or_sections, dim=-1, name=None): # out0.shape [3, 2, 5] # out1.shape [3, 3, 5] # out2.shape [3, 4, 5] - + # dim is negative, the real dim is (rank(input) + axis) which real # value is 1. out0, out1, out2 = fluid.layers.split(input, num_or_sections=3, dim=-2) @@ -5212,15 +5614,17 @@ def split(input, num_or_sections, dim=-1, name=None): if utils._contain_var(num_or_sections): for index, item in enumerate(num_or_sections): if isinstance(item, Variable): - num_or_sections[index] = num_or_sections[index].numpy( - )[0] + num_or_sections[index] = num_or_sections[index].numpy()[ + 0 + ] attrs += ('sections', list(num_or_sections)) else: attrs += ('sections', list(num_or_sections)) else: raise TypeError( "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " - "received %s." % (type(num_or_sections))) + "received %s." % (type(num_or_sections)) + ) if in_dygraph_mode(): if isinstance(num_or_sections, int): return _C_ops.split_with_num(input, num_or_sections, dim) @@ -5232,8 +5636,11 @@ def split(input, num_or_sections, dim=-1, name=None): return out check_variable_and_dtype( - input, 'input', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'split') + input, + 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'split', + ) check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split') check_type(dim, 'dim', (int, Variable), 'split') if isinstance(dim, Variable): @@ -5253,19 +5660,18 @@ def split(input, num_or_sections, dim=-1, name=None): dim_size.stop_gradient = True tensor_list.append(dim_size) else: - assert (isinstance(dim_size, int)) + assert isinstance(dim_size, int) if dim_size == -1: assert unk_dim_idx == -1, ( "Only one value of 'num_or_section' in split can " - "be -1. But received num_or_section[%d] is also -1." % - idx) + "be -1. But received num_or_section[%d] is also -1." + % idx + ) unk_dim_idx = idx temp_out = helper.create_variable_for_type_inference('int32') - fill_constant([1], - 'int32', - dim_size, - force_cpu=True, - out=temp_out) + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out + ) tensor_list.append(temp_out) return tensor_list @@ -5280,31 +5686,37 @@ def split(input, num_or_sections, dim=-1, name=None): if isinstance(num_or_sections, int): assert num_or_sections > 1, 'num_or_sections must be more than 1.' if isinstance(dim, int) and input_shape[dim] > 0: - assert input_shape[dim] % num_or_sections ==0, \ - "The input's size along the split dimension " \ - "must be evenly divisible by Attr(num_or_sections). " \ - "But %d is not evenly divisible by %d. 
" % (num_or_sections,input_shape[dim]) + assert input_shape[dim] % num_or_sections == 0, ( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But %d is not evenly divisible by %d. " + % (num_or_sections, input_shape[dim]) + ) num = num_or_sections else: if isinstance(dim, int) and input_shape[dim] > 0: - assert len(num_or_sections) <= input_shape[ - dim], 'len(num_or_sections) must not be more than input.shape[dim].' + assert ( + len(num_or_sections) <= input_shape[dim] + ), 'len(num_or_sections) must not be more than input.shape[dim].' num = len(num_or_sections) attrs['sections'] = list( - map(lambda ele: -1 - if isinstance(ele, Variable) else ele, num_or_sections)) + map( + lambda ele: -1 if isinstance(ele, Variable) else ele, + num_or_sections, + ) + ) if utils._contain_var(num_or_sections): inputs['SectionsTensorList'] = _get_SectionsTensorList( - num_or_sections) + num_or_sections + ) outs = [ helper.create_variable_for_type_inference(dtype=helper.input_dtype()) for i in range(num) ] - helper.append_op(type='split', - inputs=inputs, - outputs={'Out': outs}, - attrs=attrs) + helper.append_op( + type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs + ) return outs @@ -5355,8 +5767,9 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): if in_dygraph_mode(): out, _ = _C_ops.norm(x, 1 if axis is None else axis, epsilon, False) elif _in_legacy_dygraph(): - _, out = _legacy_C_ops.norm(x, 'axis', 1 if axis is None else axis, - 'epsilon', epsilon) + _, out = _legacy_C_ops.norm( + x, 'axis', 1 if axis is None else axis, 'epsilon', epsilon + ) return out check_variable_and_dtype(x, "X", ("float16", "float32", "float64"), "norm") @@ -5364,16 +5777,15 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): helper = LayerHelper("l2_normalize", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) norm = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="norm", - inputs={"X": x}, - outputs={ - "Out": out, - "Norm": norm - }, - attrs={ - "axis": 1 if axis is None else axis, - "epsilon": epsilon, - }) + helper.append_op( + type="norm", + inputs={"X": x}, + outputs={"Out": out, "Norm": norm}, + attrs={ + "axis": 1 if axis is None else axis, + "epsilon": epsilon, + }, + ) return out @@ -5454,16 +5866,25 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ if _non_static_mode(): out = _varbase_creator(dtype=x.dtype) - _legacy_C_ops.matmul(x, y, out, 'transpose_X', transpose_x, - 'transpose_Y', transpose_y, 'alpha', float(alpha)) + _legacy_C_ops.matmul( + x, + y, + out, + 'transpose_X', + transpose_x, + 'transpose_Y', + transpose_y, + 'alpha', + float(alpha), + ) return out def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): - check_variable_and_dtype(val, name, - ['float16', 'float32', 'float64'], - 'matmul') + check_variable_and_dtype( + val, name, ['float16', 'float32', 'float64'], 'matmul' + ) x_shape = list(x.shape) y_shape = list(y.shape) if len(x_shape) == 1: @@ -5477,11 +5898,12 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): if transpose_y: y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] if x_shape[-1] != y_shape[-2]: - assert (x_shape[-1] == -1) or (y_shape[-2] == -1), \ - "After performing an optional transpose, Input X's width should be " \ - "equal to Y's width for multiplication " \ - "prerequisites. 
But received X's shape: %s, Y's shape: %s\n" % \ - (x_shape, y_shape) + assert (x_shape[-1] == -1) or (y_shape[-2] == -1), ( + "After performing an optional transpose, Input X's width should be " + "equal to Y's width for multiplication " + "prerequisites. But received X's shape: %s, Y's shape: %s\n" + % (x_shape, y_shape) + ) if len(y_shape) > 2 and len(x_shape) > 2: for i, dim_x in enumerate(x_shape[:-2]): @@ -5493,7 +5915,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): "When the matrix is larger than 2 dimensions, the higher " "dimensional values of the two matrices need to be equal. " "But received x_shape[%d] != y_shape[%d]. X's shape: %s, " - "Y's shape: %s.\n" % (i, i, x_shape, y_shape)) + "Y's shape: %s.\n" % (i, i, x_shape, y_shape) + ) attrs = { 'transpose_X': transpose_x, @@ -5505,21 +5928,20 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): helper = LayerHelper('matmul', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='matmul', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='matmul', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs=attrs, + ) return out def topk(input, k, name=None): """ :alias_main: paddle.topk - :alias: paddle.topk,paddle.tensor.topk,paddle.tensor.search.topk - :old_api: paddle.fluid.layers.topk + :alias: paddle.topk,paddle.tensor.topk,paddle.tensor.search.topk + :old_api: paddle.fluid.layers.topk This OP is used to find values and indices of the k largest entries for the last dimension. @@ -5604,23 +6026,20 @@ def topk(input, k, name=None): values = helper.create_variable_for_type_inference(dtype=input.dtype) indices = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op(type="top_k", - inputs=inputs, - outputs={ - "Out": [values], - "Indices": [indices] - }, - attrs=attrs) + helper.append_op( + type="top_k", + inputs=inputs, + outputs={"Out": [values], "Indices": [indices]}, + attrs=attrs, + ) values.stop_gradient = True indices.stop_gradient = True return values, indices -def ctc_greedy_decoder(input, - blank, - input_length=None, - padding_value=0, - name=None): +def ctc_greedy_decoder( + input, blank, input_length=None, padding_value=0, name=None +): r""" This op is used to decode sequences by greedy policy by the following steps: @@ -5746,8 +6165,9 @@ def ctc_greedy_decoder(input, input_length=x_pad_len) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'ctc_greedy_decoder') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'ctc_greedy_decoder' + ) helper = LayerHelper("ctc_greedy_decoder", **locals()) _, topk_indices = topk(input, k=1) @@ -5756,32 +6176,27 @@ def ctc_greedy_decoder(input, ctc_out = helper.create_variable_for_type_inference(dtype="int64") if input_length is None: - helper.append_op(type="ctc_align", - inputs={"Input": [topk_indices]}, - outputs={"Output": [ctc_out]}, - attrs={ - "merge_repeated": True, - "blank": blank - }) + helper.append_op( + type="ctc_align", + inputs={"Input": [topk_indices]}, + outputs={"Output": [ctc_out]}, + attrs={"merge_repeated": True, "blank": blank}, + ) return ctc_out else: ctc_out_len = helper.create_variable_for_type_inference(dtype="int64") ctc_input = squeeze(topk_indices, [2]) - helper.append_op(type="ctc_align", - inputs={ - "Input": [ctc_input], - "InputLength": [input_length] - }, - outputs={ - "Output": [ctc_out], - "OutputLength": [ctc_out_len] - }, - 
attrs={ - "merge_repeated": True, - "blank": blank, - "padding_value": padding_value - }) + helper.append_op( + type="ctc_align", + inputs={"Input": [ctc_input], "InputLength": [input_length]}, + outputs={"Output": [ctc_out], "OutputLength": [ctc_out_len]}, + attrs={ + "merge_repeated": True, + "blank": blank, + "padding_value": padding_value, + }, + ) return ctc_out, ctc_out_len @@ -5842,10 +6257,21 @@ def transpose(x, perm, name=None): out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype(x, 'x', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'transpose') + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'transpose', + ) check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): perm = list(perm) @@ -5854,34 +6280,37 @@ def transpose(x, perm, name=None): "Input(perm) is the permutation of dimensions of Input(x), " "its length should be equal to dimensions of Input(x), " "but received dimension of Input(x) is %s, " - "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + "the length of Input(perm) is %s." % (len(x.shape), len(perm)) + ) for idx, dim in enumerate(perm): if dim >= len(x.shape): raise ValueError( "Each element in Input(perm) should be less than Input(x)'s dimension, " "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " - "dimension %d." % (idx, perm[idx], len(x.shape))) + "dimension %d." % (idx, perm[idx], len(x.shape)) + ) helper = LayerHelper('transpose', **locals()) out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='transpose2', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'XShape': [x_shape] - }, - attrs={'axis': perm}) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], 'XShape': [x_shape]}, + attrs={'axis': perm}, + ) return out -def im2sequence(input, - filter_size=1, - stride=1, - padding=0, - input_image_size=None, - out_stride=1, - name=None): +def im2sequence( + input, + filter_size=1, + stride=1, + padding=0, + input_image_size=None, + out_stride=1, + name=None, +): r""" :api_attr: Static Graph @@ -5995,8 +6424,9 @@ def im2sequence(input, """ - assert not _non_static_mode(), ( - "sequence layer is not supported in dygraph mode yet.") + assert ( + not _non_static_mode() + ), "sequence layer is not supported in dygraph mode yet." 
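``ctc_greedy_decoder`` above realizes the greedy policy as an argmax (``topk`` with ``k=1``) followed by ``ctc_align`` with ``merge_repeated=True``. A plain-NumPy sketch of the same decode rule for a single unpadded sequence (illustrative only; it ignores LoD and padding handling):

.. code-block:: python

    import numpy as np

    def ctc_greedy_decode(probs, blank):
        """Greedy CTC decode for one sequence of shape [T, num_classes]."""
        best = probs.argmax(axis=1)  # step 1: best class per timestep (topk, k=1)
        # step 2: merge consecutive repeats
        merged = [t for i, t in enumerate(best) if i == 0 or t != best[i - 1]]
        # step 3: drop the blank label
        return [int(t) for t in merged if t != blank]

    scores = np.array([[0.1, 0.8, 0.1],
                       [0.2, 0.7, 0.1],
                       [0.1, 0.1, 0.8],
                       [0.9, 0.05, 0.05],
                       [0.9, 0.05, 0.05],
                       [0.1, 0.1, 0.8]])
    print(ctc_greedy_decode(scores, blank=2))  # [1, 0]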
check_variable_and_dtype(input, 'input', ['float32'], 'im2sequence') @@ -6018,10 +6448,9 @@ def im2sequence(input, attrs["out_stride"] = out_stride helper = LayerHelper('im2sequence', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='im2sequence', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -6061,16 +6490,15 @@ def row_conv(input, future_context_size, param_attr=None, act=None): check_variable_and_dtype(input, 'input', ['float32'], 'row_conv') dtype = helper.input_dtype() filter_shape = [future_context_size + 1, input.shape[-1]] - filter_param = helper.create_parameter(attr=helper.param_attr, - shape=filter_shape, - dtype=dtype) + filter_param = helper.create_parameter( + attr=helper.param_attr, shape=filter_shape, dtype=dtype + ) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='row_conv', - inputs={ - 'X': [input], - 'Filter': [filter_param] - }, - outputs={'Out': [out]}) + helper.append_op( + type='row_conv', + inputs={'X': [input], 'Filter': [filter_param]}, + outputs={'Out': [out]}, + ) return helper.append_activation(out) @@ -6136,20 +6564,23 @@ def multiplex(inputs, index, name=None): check_type(inputs, 'inputs', (list), 'multiplex') if len(inputs) < 2: raise ValueError( - "inputs should be a list object with at least 2 elements.") + "inputs should be a list object with at least 2 elements." + ) for id, x in enumerate(inputs): - check_variable_and_dtype(x, 'input[' + str(id) + ']', - ['float32', 'float64', 'int32', 'int64'], - 'multiplex') + check_variable_and_dtype( + x, + 'input[' + str(id) + ']', + ['float32', 'float64', 'int32', 'int64'], + 'multiplex', + ) check_variable_and_dtype(index, "index", ['int32', 'int64'], 'multiplex') out = helper.create_variable_for_type_inference(inputs[0].dtype) - helper.append_op(type='multiplex', - inputs={ - 'X': inputs, - 'Ids': index - }, - outputs={'Out': [out]}) + helper.append_op( + type='multiplex', + inputs={'X': inputs, 'Ids': index}, + outputs={'Out': [out]}, + ) return out @@ -6216,18 +6647,17 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): diff = helper.create_variable_for_type_inference(dtype=x.dtype) loss = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='smooth_l1_loss', - inputs={ - 'X': x, - 'Y': y, - 'InsideWeight': inside_weight, - 'OutsideWeight': outside_weight - }, - outputs={ - 'Diff': diff, - 'Out': loss - }, - attrs={'sigma': sigma if sigma is not None else 1.0}) + helper.append_op( + type='smooth_l1_loss', + inputs={ + 'X': x, + 'Y': y, + 'InsideWeight': inside_weight, + 'OutsideWeight': outside_weight, + }, + outputs={'Diff': diff, 'Out': loss}, + attrs={'sigma': sigma if sigma is not None else 1.0}, + ) return loss @@ -6319,10 +6749,12 @@ def one_hot(input, depth, allow_out_of_range=False): if isinstance(depth, Variable): depth = depth.numpy() assert depth.shape == ( - 1, ), "depth of type Variable should have shape [1]" + 1, + ), "depth of type Variable should have shape [1]" depth = depth.item(0) - out = _legacy_C_ops.one_hot(input, 'depth', depth, 'allow_out_of_range', - allow_out_of_range) + out = _legacy_C_ops.one_hot( + input, 'depth', depth, 'allow_out_of_range', allow_out_of_range + ) out.stop_gradient = True return out @@ -6339,10 +6771,9 @@ def one_hot(input, depth, allow_out_of_range=False): depth.stop_gradient = True inputs = {'X': 
input, 'depth_tensor': depth} attrs = {'allow_out_of_range': allow_out_of_range} - helper.append_op(type="one_hot", - inputs=inputs, - attrs=attrs, - outputs={'Out': one_hot_out}) + helper.append_op( + type="one_hot", inputs=inputs, attrs=attrs, outputs={'Out': one_hot_out} + ) one_hot_out.stop_gradient = True return one_hot_out @@ -6380,16 +6811,18 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): dtype='int64', shape=[1], persistable=True, - belong_to_optimizer=True) + belong_to_optimizer=True, + ) if is_new_var: - helper.set_variable_initializer(counter, - initializer=Constant(value=begin - 1, - force_cpu=True)) + helper.set_variable_initializer( + counter, initializer=Constant(value=begin - 1, force_cpu=True) + ) helper.main_program.global_block()._prepend_op( type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + ) counter.stop_gradient = True return counter @@ -6493,7 +6926,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): """ if in_dygraph_mode(): tmp_tensor_type = core.eager.Tensor - #TODO(zhiqiu): enable inplace in dygraph mode. + # TODO(zhiqiu): enable inplace in dygraph mode. if inplace: warnings.warn( "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." @@ -6511,7 +6944,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return dygraph_utils._append_activation_in_dygraph(out, act) else: @@ -6533,14 +6967,26 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return dygraph_utils._append_activation_in_dygraph(out, act) - check_variable_and_dtype(x, 'x', [ - 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'bool', - 'uint16' - ], 'reshape') + check_variable_and_dtype( + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int16', + 'int32', + 'int64', + 'bool', + 'uint16', + ], + 'reshape', + ) check_type(shape, 'shape', (list, tuple, Variable), 'reshape') check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') @@ -6564,20 +7010,23 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): "\t# z.shape is [-1, -1, 4]\n\n" " If your target shape in Reshape represents dynamic shape, " "please turn it into a Tensor under @to_static. See above example for details." - % dim_idx) + % dim_idx + ) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. " - "But received shape[%d] = 0, X's dimensions = %d." % - (dim_idx, len(x.shape))) + "But received shape[%d] = 0, X's dimensions = %d." + % (dim_idx, len(x.shape)) + ) else: assert dim_size > 0, ( "Each dimension value of 'shape' in reshape must not " "be negative except one unknown dimension. " - "But received shape[%d] = %s." % - (dim_idx, str(dim_size))) + "But received shape[%d] = %s." 
+ % (dim_idx, str(dim_size)) + ) return attrs_shape inputs = {"X": x} @@ -6586,8 +7035,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): shape.stop_gradient = True inputs["Shape"] = shape elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ("The size of 'shape' in reshape can't be zero, " - "but received %s." % len(shape)) + assert len(shape) > 0, ( + "The size of 'shape' in reshape can't be zero, " + "but received %s." % len(shape) + ) attrs["shape"] = get_attr_shape(shape) if utils._contain_var(shape): inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) @@ -6595,16 +7046,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): actual_shape.stop_gradient = True inputs["Shape"] = actual_shape - out = x if inplace else helper.create_variable_for_type_inference( - dtype=x.dtype) + out = ( + x + if inplace + else helper.create_variable_for_type_inference(dtype=x.dtype) + ) x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="reshape2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="reshape2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return helper.append_activation(out) @@ -6669,10 +7122,22 @@ def squeeze(input, axes, name=None): return out helper = LayerHelper("squeeze", **locals()) - check_variable_and_dtype(input, 'input', [ - 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', - 'complex64', 'complex128' - ], 'squeeze') + check_variable_and_dtype( + input, + 'input', + [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'squeeze', + ) check_type(axes, 'axis/axes', (list, tuple, Variable), 'squeeze') attrs = {} @@ -6686,13 +7151,12 @@ def squeeze(input, axes, name=None): attrs["axes"] = axes out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type="squeeze2", - inputs={"X": input}, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="squeeze2", + inputs={"X": input}, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return out @@ -6742,18 +7206,23 @@ def unsqueeze(input, axes, name=None): return _C_ops.unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') - check_variable_and_dtype(input, 'input', [ - 'float16', - 'float32', - 'float64', - 'bool', - 'int8', - 'int16', - 'int32', - 'int64', - 'complex64', - 'complex128', - ], 'unsqueeze') + check_variable_and_dtype( + input, + 'input', + [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'unsqueeze', + ) helper = LayerHelper("unsqueeze2", **locals()) inputs = {"X": input} attrs = {} @@ -6771,13 +7240,12 @@ def unsqueeze(input, axes, name=None): out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type="unsqueeze2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="unsqueeze2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return out @@ -6841,10 +7309,10 @@ def lod_reset(x, y=None, target_lod=None): out.dims = [6, 1] Args: - x (Variable): Input variable which could be a 
Tensor or LoDTensor. + x (Variable): Input variable which could be a Tensor or LoDTensor. The data type should be int32, int64, float32 or float64. - y (Variable, optional): If provided, output's LoD would be derived from :attr:`y`. - If y's lod level>0, the data type can be any type. + y (Variable, optional): If provided, output's LoD would be derived from :attr:`y`. + If y's lod level>0, the data type can be any type. If y's lod level=0, the data type should be int32. target_lod (list|tuple, optional): One level LoD which should be considered as target LoD when :attr:`y` not provided. @@ -6863,24 +7331,24 @@ def lod_reset(x, y=None, target_lod=None): y = fluid.layers.data(name='y', shape=[10, 20], lod_level=2) out = fluid.layers.lod_reset(x=x, y=y) """ - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'lod_reset') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'lod_reset' + ) helper = LayerHelper("lod_reset", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) if y is not None: check_type(y, 'y', (Variable), 'lod_reset') - #TODO: check y.lod_level = 0 dtype - helper.append_op(type="lod_reset", - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}) + # TODO: check y.lod_level = 0 dtype + helper.append_op( + type="lod_reset", inputs={'X': x, 'Y': y}, outputs={'Out': out} + ) elif target_lod is not None: - helper.append_op(type="lod_reset", - inputs={'X': x}, - attrs={'target_lod': target_lod}, - outputs={'Out': out}) + helper.append_op( + type="lod_reset", + inputs={'X': x}, + attrs={'target_lod': target_lod}, + outputs={'Out': out}, + ) else: raise ValueError("y and target_lod should not be both none.") return out @@ -6907,9 +7375,9 @@ def lod_append(x, level): x.dims = [6, 1] Args: - x (Variable): Input variable which could be a tensor or LoDTensor. + x (Variable): Input variable which could be a tensor or LoDTensor. The data type should be int32, int64, float32 or float64. - level (list|tuple|Variable, optional): The LoD level to be appended into LoD of x. + level (list|tuple|Variable, optional): The LoD level to be appended into LoD of x. If level is variable and its lod level>0, the data type can be any type. If level is variable and its lod level=0, the data type should be int32. 
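``lod_reset`` above takes the target LoD either from ``y`` or from ``target_lod``. A small sketch of what a one-level, length-based LoD means for a flat batch (assuming the length-based convention used in the examples; the helper name is illustrative):

.. code-block:: python

    import numpy as np

    def split_by_lod(data, target_lod):
        """Interpret a flat batch `data` ([N, ...]) as sequences whose lengths
        are given by a one-level LoD, as lod_reset's `target_lod` does."""
        assert sum(target_lod) == len(data), "LoD lengths must cover the batch"
        seqs, start = [], 0
        for length in target_lod:
            seqs.append(data[start:start + length])
            start += length
        return seqs

    x = np.arange(6).reshape(6, 1)
    print([s.ravel().tolist() for s in split_by_lod(x, [4, 2])])  # [[0, 1, 2, 3], [4, 5]]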
Returns: @@ -6934,8 +7402,9 @@ def lod_append(x, level): if (not isinstance(level, Iterable)) and (not isinstance(level, Variable)): raise ValueError("Input(level) must be list, tuple or Variable.") - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'lod_append') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'lod_append' + ) helper = LayerHelper("lod_append", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6945,27 +7414,22 @@ def lod_append(x, level): if isinstance(level, Variable): inputs['Y'] = level - #TODO: check y.lod_level = 0 dtype + # TODO: check y.lod_level = 0 dtype else: attrs['target_lod'] = level - helper.append_op(type="lod_reset", - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + helper.append_op( + type="lod_reset", inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out -def lrn(input, - n=5, - k=1.0, - alpha=1e-4, - beta=0.75, - name=None, - data_format='NCHW'): +def lrn( + input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None, data_format='NCHW' +): r""" :alias_main: paddle.nn.functional.lrn - :alias: paddle.nn.functional.lrn,paddle.nn.functional.norm.lrn - :old_api: paddle.fluid.layers.lrn + :alias: paddle.nn.functional.lrn,paddle.nn.functional.norm.lrn + :old_api: paddle.fluid.layers.lrn This operator implements the Local Response Normalization Layer. This layer performs a type of "lateral inhibition" by normalizing over local input regions. @@ -7023,38 +7487,44 @@ def lrn(input, if dims != 4: raise ValueError( - "Input's dimension size of Op(lrn) must be 4, but received %d." % - (dims)) + "Input's dimension size of Op(lrn) must be 4, but received %d." + % (dims) + ) if data_format not in ['NCHW', 'NHWC']: raise ValueError( - "Attr(data_format) of Op(lrn) got wrong value: received " + - data_format + " but only NCHW or NHWC supported.") + "Attr(data_format) of Op(lrn) got wrong value: received " + + data_format + + " but only NCHW or NHWC supported." + ) - mid_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + mid_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) lrn_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="lrn", - inputs={"X": input}, - outputs={ - "Out": lrn_out, - "MidOut": mid_out, - }, - attrs={ - "n": n, - "k": k, - "alpha": alpha, - "beta": beta, - "data_format": data_format - }) + helper.append_op( + type="lrn", + inputs={"X": input}, + outputs={ + "Out": lrn_out, + "MidOut": mid_out, + }, + attrs={ + "n": n, + "k": k, + "alpha": alpha, + "beta": beta, + "data_format": data_format, + }, + ) return lrn_out -def pad(x, paddings, pad_value=0., name=None): +def pad(x, paddings, pad_value=0.0, name=None): r""" :alias_main: paddle.nn.functional.pad - :alias: paddle.nn.functional.pad,paddle.nn.functional.common.pad - :old_api: paddle.fluid.layers.pad + :alias: paddle.nn.functional.pad,paddle.nn.functional.common.pad + :old_api: paddle.fluid.layers.pad This op will pad a tensor with a constant value given by :attr:`pad_value`, and the padded shape is specified by :attr:`paddings`. @@ -7105,10 +7575,20 @@ def pad(x, paddings, pad_value=0., name=None): x = fluid.data(name='data', shape=[300, 300], dtype='float32') out = fluid.layers.pad(x=x, paddings=[0, 1, 1, 2], pad_value=0.) 
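In ``pad`` above, ``paddings`` stores a (before, after) pair per dimension, flattened as ``[b0, a0, b1, a1, ...]``, so the example pads the 300x300 input to 301x303. A tiny shape helper illustrating that layout (the helper name is an assumption, not part of the API):

.. code-block:: python

    def padded_shape(shape, paddings):
        # paddings[2*i] pads before dimension i, paddings[2*i + 1] pads after it.
        assert len(paddings) == 2 * len(shape)
        return [d + paddings[2 * i] + paddings[2 * i + 1] for i, d in enumerate(shape)]

    print(padded_shape([300, 300], [0, 1, 1, 2]))  # [301, 303]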
""" - check_variable_and_dtype(x, 'x', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], "pad") + check_variable_and_dtype( + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + "pad", + ) check_type(pad_value, 'pad_value', (float, int, Variable), 'pad') if isinstance(pad_value, int): @@ -7117,17 +7597,16 @@ def pad(x, paddings, pad_value=0., name=None): helper = LayerHelper('pad', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'paddings': paddings, - 'pad_value': pad_value - }) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, 'pad_value': pad_value}, + ) return out -def pad_constant_like(x, y, pad_value=0., name=None): +def pad_constant_like(x, y, pad_value=0.0, name=None): r""" Pad :attr:`y` with :attr:`pad_value`, the number of values padded to the edges of each axis is specified by the difference of the shape @@ -7207,31 +7686,29 @@ def pad_constant_like(x, y, pad_value=0., name=None): # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3] """ check_type(x, 'x', (Variable), 'pad_constant_like') - check_variable_and_dtype(y, 'y', ['float32', 'float64', 'int32', 'int64'], - "pad_constant_like") + check_variable_and_dtype( + y, 'y', ['float32', 'float64', 'int32', 'int64'], "pad_constant_like" + ) helper = LayerHelper('pad_constant_like', **locals()) dtype = helper.input_dtype(input_param_name='y') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad_constant_like', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs={'pad_value': float(pad_value)}) + helper.append_op( + type='pad_constant_like', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs={'pad_value': float(pad_value)}, + ) return out -def label_smooth(label, - prior_dist=None, - epsilon=0.1, - dtype="float32", - name=None): +def label_smooth( + label, prior_dist=None, epsilon=0.1, dtype="float32", name=None +): r""" :alias_main: paddle.nn.functional.label_smooth - :alias: paddle.nn.functional.label_smooth,paddle.nn.functional.common.label_smooth - :old_api: paddle.fluid.layers.label_smooth + :alias: paddle.nn.functional.label_smooth,paddle.nn.functional.common.label_smooth + :old_api: paddle.fluid.layers.label_smooth Label smoothing is a mechanism to regularize the classifier layer and is called label-smoothing regularization (LSR). @@ -7288,37 +7765,42 @@ def label_smooth(label, if in_dygraph_mode(): return _C_ops.label_smooth(label, prior_dist, float(epsilon)) - if epsilon > 1. 
or epsilon < 0.: + if epsilon > 1.0 or epsilon < 0.0: raise ValueError("The value of epsilon must be between 0 and 1.") if _non_static_mode(): - return _legacy_C_ops.label_smooth(label, prior_dist, 'epsilon', - float(epsilon)) + return _legacy_C_ops.label_smooth( + label, prior_dist, 'epsilon', float(epsilon) + ) - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'label_smooth') + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'label_smooth' + ) helper = LayerHelper("label_smooth", **locals()) label.stop_gradient = True smooth_label = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="label_smooth", - inputs={ - "X": label, - "PriorDist": prior_dist - } if prior_dist else {"X": label}, - outputs={"Out": smooth_label}, - attrs={"epsilon": float(epsilon)}) + helper.append_op( + type="label_smooth", + inputs={"X": label, "PriorDist": prior_dist} + if prior_dist + else {"X": label}, + outputs={"Out": smooth_label}, + attrs={"epsilon": float(epsilon)}, + ) return smooth_label @templatedoc() -def roi_pool(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0, - rois_num=None, - name=None): +def roi_pool( + input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0, + rois_num=None, + name=None, +): """ This operator implements the roi_pooling layer. @@ -7384,10 +7866,20 @@ def roi_pool(input, print(np.array(out).shape) # (2, 1, 1, 1) """ if _non_static_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." pool_out, argmaxes = _legacy_C_ops.roi_pool( - input, rois, rois_num, "pooled_height", pooled_height, - "pooled_width", pooled_width, "spatial_scale", spatial_scale) + input, + rois, + rois_num, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + "spatial_scale", + spatial_scale, + ) return pool_out, argmaxes check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') @@ -7403,29 +7895,30 @@ def roi_pool(input, } if rois_num is not None: inputs['RoisNum'] = rois_num - helper.append_op(type="roi_pool", - inputs=inputs, - outputs={ - "Out": pool_out, - "Argmax": argmaxes - }, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) + helper.append_op( + type="roi_pool", + inputs=inputs, + outputs={"Out": pool_out, "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + }, + ) return pool_out @templatedoc() -def roi_align(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0, - sampling_ratio=-1, - rois_num=None, - name=None): +def roi_align( + input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0, + sampling_ratio=-1, + rois_num=None, + name=None, +): """ ${comment} @@ -7473,21 +7966,41 @@ def roi_align(input, rois_num=rois_num) """ if in_dygraph_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - return _C_ops.roi_align(input, rois, rois_num, pooled_height, - pooled_width, spatial_scale, sampling_ratio, - False) + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." + return _C_ops.roi_align( + input, + rois, + rois_num, + pooled_height, + pooled_width, + spatial_scale, + sampling_ratio, + False, + ) if _in_legacy_dygraph(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." 
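The ``epsilon`` bound checked in ``label_smooth`` above is the LSR mixing weight. A NumPy sketch of the standard LSR mix, assuming a uniform prior over the classes when ``prior_dist`` is omitted (an assumption stated for illustration only):

.. code-block:: python

    import numpy as np

    def label_smooth_np(onehot, epsilon=0.1, prior_dist=None):
        """Standard LSR: blend the one-hot target with a prior distribution."""
        num_classes = onehot.shape[-1]
        if prior_dist is None:
            prior_dist = np.full((num_classes,), 1.0 / num_classes)  # assumed default
        return (1.0 - epsilon) * onehot + epsilon * prior_dist

    y = np.array([[0.0, 1.0, 0.0]])
    print(label_smooth_np(y, epsilon=0.1))  # [[0.0333..., 0.9333..., 0.0333...]]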
- align_out = _legacy_C_ops.roi_align(input, rois, rois_num, - "pooled_height", pooled_height, - "pooled_width", pooled_width, - "spatial_scale", spatial_scale, - "sampling_ratio", sampling_ratio) + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." + align_out = _legacy_C_ops.roi_align( + input, + rois, + rois_num, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + "spatial_scale", + spatial_scale, + "sampling_ratio", + sampling_ratio, + ) return align_out - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'roi_align') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'roi_align' + ) check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], 'roi_align') helper = LayerHelper('roi_align', **locals()) dtype = helper.input_dtype() @@ -7498,15 +8011,17 @@ def roi_align(input, } if rois_num is not None: inputs['RoisNum'] = rois_num - helper.append_op(type="roi_align", - inputs=inputs, - outputs={"Out": align_out}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale, - "sampling_ratio": sampling_ratio - }) + helper.append_op( + type="roi_align", + inputs=inputs, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio, + }, + ) return align_out @@ -7552,21 +8067,22 @@ def dice_loss(input, label, epsilon=0.00001, name=None): predictions = F.softmax(x) loss = F.dice_loss(input=predictions, label=label) """ - return paddle.nn.functional.dice_loss(input, - label, - epsilon=epsilon, - name=name) - - -def image_resize(input, - out_shape=None, - scale=None, - name=None, - resample='BILINEAR', - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCHW'): + return paddle.nn.functional.dice_loss( + input, label, epsilon=epsilon, name=name + ) + + +def image_resize( + input, + out_shape=None, + scale=None, + name=None, + resample='BILINEAR', + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCHW', +): """ This op resizes a batch of images. @@ -7581,19 +8097,19 @@ def image_resize(input, future and only use :attr:`out_shape` instead. Supporting resample methods: - 'LINEAR' : Linear interpolation + 'LINEAR' : Linear interpolation 'BILINEAR' : Bilinear interpolation 'TRILINEAR' : Trilinear interpolation 'NEAREST' : Nearest neighbor interpolation - + 'BICUBIC' : Bicubic interpolation - - Linear interpolation is the method of using a line connecting two known quantities + + Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. - + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -7608,7 +8124,7 @@ def image_resize(input, interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. 
The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -7707,7 +8223,7 @@ def image_resize(input, output: (N,C,D_out,H_out,W_out) where: D_out = D_{in} * scale_{factor} - + Trilinear interpolation: if: align_corners = False , align_mode = 0 @@ -7722,20 +8238,20 @@ def image_resize(input, D_out = D_{in} * scale_{factor} H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - + For details of linear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Linear_interpolation. - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation @@ -7743,8 +8259,8 @@ def image_resize(input, input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. out_shape (list|tuple|Variable|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. scale(float|Variable|None): The multiplier for the input height or width. At @@ -7772,8 +8288,8 @@ def image_resize(input, input and output tensors are aligned, preserving the values at the corner pixels. Default: True - align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the fomula in the - the example code above, it can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , + align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the fomula in the + the example code above, it can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , can be \'1\' for src_idx = scale*dst_index. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, @@ -7806,65 +8322,65 @@ def image_resize(input, Examples: .. 
code-block:: python - #declarative mode - import paddle - import paddle.fluid as fluid - import numpy as np - paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,10]) + #declarative mode + import paddle + import paddle.fluid as fluid + import numpy as np + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = fluid.layers.image_resize(input=input,out_shape=[12,12]) + #1 + output = fluid.layers.image_resize(input=input,out_shape=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.image_resize(input=input,out_shape=[12,dim1]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.image_resize(input=input,out_shape=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.image_resize(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.image_resize(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.image_resize(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.image_resize(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,10).astype("float32") + input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) + #1 + # (2, 3, 12, 12) + #2 + # (2, 3, 12, 2) + #3 + # (2, 3, 3, 12) + #4 + # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.image_resize(input=input, out_shape=[12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.image_resize(input=input, out_shape=[12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L] + # [2L, 3L, 12L, 12L] """ resample_methods = { @@ -7878,7 +8394,8 @@ def image_resize(input, if resample not in resample_methods: raise ValueError( "The 'resample' of image_resize can only be 'LINEAR', 'BILINEAR', 'TRILINEAR' " - "or 'NEAREST' currently.") + "or 'NEAREST' currently." 
+ ) resample_type = resample_methods[resample] if resample == 'LINEAR' and len(input.shape) != 3: @@ -7900,19 +8417,25 @@ def image_resize(input, if len(input.shape) == 3 and data_format not in ['NCW', 'NWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCW` or `NWC` supported for 3-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCW` or `NWC` supported for 3-D input." + ) elif len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCHW` or `NHWC` supported for 4-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCHW` or `NHWC` supported for 4-D input." + ) elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCDHW` or `NDHWC` supported for 5-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCDHW` or `NDHWC` supported for 5-D input." + ) def _is_list_or_turple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) + return isinstance(data, list) or isinstance(data, tuple) if data_format == 'NCHW' or data_format == 'NCDHW' or data_format == 'NCW': data_layout = 'NCHW' @@ -7927,7 +8450,7 @@ def image_resize(input, "interp_method": resample_type, "align_corners": align_corners, "align_mode": align_mode, - "data_layout": data_layout + "data_layout": data_layout, } if out_shape is not None: @@ -7945,16 +8468,17 @@ def image_resize(input, out_shape[i] = dim.numpy()[0] if not (_is_list_or_turple_(out_shape)): raise TypeError( - "out_shape should be a list or tuple or Variable.") + "out_shape should be a list or tuple or Variable." + ) # Validate the shape contain_var = False for dim_idx, dim_size in enumerate(out_shape): if isinstance(dim_size, Variable): contain_var = True continue - assert dim_size > 0, ( - "Each dimension size given in out_shape must be greater than 0." - ) + assert ( + dim_size > 0 + ), "Each dimension size given in out_shape must be greater than 0." if contain_var: new_size_tensor = [] @@ -7965,22 +8489,22 @@ def image_resize(input, new_size_tensor.append(dim) size_list.append(-1) else: - assert (isinstance(dim, int)) + assert isinstance(dim, int) temp_out = helper.create_variable_for_type_inference( - 'int32') - fill_constant([1], - 'int32', - dim, - force_cpu=True, - out=temp_out) + 'int32' + ) + fill_constant( + [1], 'int32', dim, force_cpu=True, out=temp_out + ) new_size_tensor.append(temp_out) size_list.append(dim) inputs['SizeTensor'] = new_size_tensor if len(input.shape) == 3: if len(out_shape) != 1: - raise ValueError("out_shape length should be 1 for " - "input 3-D tensor.") + raise ValueError( + "out_shape length should be 1 for " "input 3-D tensor." + ) if contain_var: attrs['out_w'] = size_list[0] else: @@ -7988,8 +8512,9 @@ def image_resize(input, attrs['out_w'] = out_shape[0] elif len(input.shape) == 4: if len(out_shape) != 2: - raise ValueError("out_shape length should be 2 for " - "input 4-D tensor.") + raise ValueError( + "out_shape length should be 2 for " "input 4-D tensor." 
+ ) if contain_var: attrs['out_h'] = size_list[0] attrs['out_w'] = size_list[1] @@ -7999,8 +8524,9 @@ def image_resize(input, attrs['out_w'] = out_shape[1] if len(input.shape) == 5: if len(out_shape) != 3: - raise ValueError("out_shape length should be 3 for " - "input 5-D tensor.") + raise ValueError( + "out_shape length should be 3 for " "input 5-D tensor." + ) if contain_var: attrs['out_d'] = size_list[0] attrs['out_h'] = size_list[1] @@ -8023,7 +8549,8 @@ def image_resize(input, attrs['scale'] = float(scale) else: raise TypeError( - "Attr(scale)'s type should be float, int or Variable.") + "Attr(scale)'s type should be float, int or Variable." + ) if isinstance(actual_shape, Variable): warnings.warn( @@ -8055,31 +8582,35 @@ def image_resize(input, return out out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='{}_interp'.format(resample_type), - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type='{}_interp'.format(resample_type), + inputs=inputs, + outputs={"Out": out}, + attrs=attrs, + ) return out @templatedoc(op_type="linear_interp") -def resize_linear(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCW'): +def resize_linear( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCW', +): """ This op resizes the input by performing linear interpolation based on given output shape which specified by actual_shape, out_shape and scale in priority order. - **Warning:** the parameter :attr:`actual_shape` will be deprecated in + **Warning:** the parameter :attr:`actual_shape` will be deprecated in the future and only use :attr:`out_shape` instead. - Align_corners and align_mode are optional parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -8087,23 +8618,23 @@ def resize_linear(input, .. code-block:: text For scale: - + if align_corners = True && out_size > 1 : scale_factor = (in_size-1.0)/(out_size-1.0) - + else: - + scale_factor = float(in_size/out_size) Linear interpolation: if: align_corners = False , align_mode = 0 - + input : (N,C,W_in) output: (N,C,W_out) where: - + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: @@ -8116,12 +8647,12 @@ def resize_linear(input, input(Variable): 3-D Tensor(NCW), its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. out_shape(list|tuple|Variable|None): Output shape of resize linear - layer, the shape is (out_w,). Default: None. If a list, each - element can be an integer or a Tensor Variable with shape: [1]. If a + layer, the shape is (out_w,). Default: None. If a list, each + element can be an integer or a Tensor Variable with shape: [1]. If a Tensor Variable, its dimension size should be 1. scale(float|Variable|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale`. + least one of :attr:`out_shape` or :attr:`scale` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale`. Default: None. actual_shape(Variable): An optional input to specify output shape dynamically. If provided, image resize @@ -8129,75 +8660,86 @@ def resize_linear(input, :attr:`out_shape` and :attr:`scale` specifying shape. That is to say actual_shape has the highest priority. 
It is recommended to use - :attr:`out_shape` if you want to specify output - shape dynamically, because :attr:`actual_shape` - will be deprecated. When using actual_shape to - specify output shape, one of :attr:`out_shape` - and :attr:`scale` should also be set, otherwise + :attr:`out_shape` if you want to specify output + shape dynamically, because :attr:`actual_shape` + will be deprecated. When using actual_shape to + specify output shape, one of :attr:`out_shape` + and :attr:`scale` should also be set, otherwise errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCW"`, `"NWC"`. The default is `"NCW"`. When it is `"NCW"`, the data is stored in the order of: `[batch_size, input_channels, input_width]`. - name(str, optional): The default value is None. Normally there is no need for user to set this property. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: 3-D tensor(NCW or NWC). - + Variable: 3-D tensor(NCW or NWC). + Examples: .. code-block:: python - - #declarative mode - import paddle.fluid as fluid - import numpy as np - input = fluid.data(name="input", shape=[None,3,100]) - output = fluid.layers.resize_linear(input=input,out_shape=[50,]) + #declarative mode + import paddle.fluid as fluid + import numpy as np + input = fluid.data(name="input", shape=[None,3,100]) + + output = fluid.layers.resize_linear(input=input,out_shape=[50,]) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - input_data = np.random.rand(1,3,100).astype("float32") + input_data = np.random.rand(1,3,100).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - - print(output_data[0].shape) - # (1, 3, 50) + print(output_data[0].shape) - #imperative mode - import paddle.fluid.dygraph as dg + # (1, 3, 50) - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_linear(input=input, out_shape=[50,]) - print(output.shape) + #imperative mode + import paddle.fluid.dygraph as dg - # [1L, 3L, 50L] + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_linear(input=input, out_shape=[50,]) + print(output.shape) + + # [1L, 3L, 50L] """ - return image_resize(input, out_shape, scale, name, 'LINEAR', actual_shape, - align_corners, align_mode, data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'LINEAR', + actual_shape, + align_corners, + align_mode, + data_format, + ) @templatedoc(op_type="bilinear_interp") -def resize_bilinear(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCHW'): +def resize_bilinear( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCHW', +): """ This op 
resizes the input by performing bilinear interpolation based on given @@ -8284,86 +8826,97 @@ def resize_bilinear(input, name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: 4-D tensor(NCHW or NHWC). + Variable: 4-D tensor(NCHW or NHWC). Examples: .. code-block:: python - #declarative mode - import paddle.fluid as fluid - import numpy as np - import paddle - paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,10]) + #declarative mode + import paddle.fluid as fluid + import numpy as np + import paddle + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = fluid.layers.resize_bilinear(input=input,out_shape=[12,12]) + #1 + output = fluid.layers.resize_bilinear(input=input,out_shape=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.resize_bilinear(input=input,out_shape=[12,dim1]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.resize_bilinear(input=input,out_shape=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.resize_bilinear(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.resize_bilinear(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.resize_bilinear(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.resize_bilinear(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,10).astype("float32") + input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) + #1 + # (2, 3, 12, 12) + #2 + # (2, 3, 12, 2) + #3 + # (2, 3, 3, 12) + #4 + # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_bilinear(input=input, out_shape=[12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_bilinear(input=input, out_shape=[12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L] + # [2L, 3L, 12L, 12L] """ - return image_resize(input, out_shape, 
scale, name, 'BILINEAR', actual_shape, - align_corners, align_mode, data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'BILINEAR', + actual_shape, + align_corners, + align_mode, + data_format, + ) @templatedoc(op_type="trilinear_interp") -def resize_trilinear(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCDHW'): +def resize_trilinear( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCDHW', +): """ This op resizes the input by performing trilinear interpolation based on given @@ -8455,82 +9008,93 @@ def resize_trilinear(input, Examples: .. code-block:: python - #declarative mode - import paddle.fluid as fluid - import paddle - import numpy as np - paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,8,10]) + #declarative mode + import paddle.fluid as fluid + import paddle + import numpy as np + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,8,10]) - #1 - output = fluid.layers.resize_trilinear(input=input,out_shape=[12,12,12]) + #1 + output = fluid.layers.resize_trilinear(input=input,out_shape=[12,12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.resize_trilinear(input=input,out_shape=[12,dim1,4]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.resize_trilinear(input=input,out_shape=[12,dim1,4]) - #3 - #x = np.array([3,12,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.resize_trilinear(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.resize_trilinear(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.resize_trilinear(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.resize_trilinear(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,8,10).astype("float32") + input_data = np.random.rand(2,3,6,8,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12, 12) - #2 - # (2, 3, 12, 2, 4) - #3 - # (2, 3, 3, 12, 12) - #4 - # (2, 3, 3, 4, 5) + #1 + # (2, 3, 12, 12, 12) + #2 + # (2, 3, 12, 2, 4) + #3 + # (2, 3, 3, 12, 12) + #4 + # (2, 3, 3, 4, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - 
with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_trilinear(input=input, out_shape=[12,12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_trilinear(input=input, out_shape=[12,12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L, 12L] + # [2L, 3L, 12L, 12L, 12L] """ - return image_resize(input, out_shape, scale, name, 'TRILINEAR', - actual_shape, align_corners, align_mode, data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'TRILINEAR', + actual_shape, + align_corners, + align_mode, + data_format, + ) @templatedoc(op_type="nearest_interp") -def resize_nearest(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - data_format='NCHW'): +def resize_nearest( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + data_format='NCHW', +): """ This op resizes the input by performing nearest neighbor interpolation in both the @@ -8586,7 +9150,7 @@ def resize_nearest(input, And :attr:`out_shape` has a higher priority than :attr:`scale`. Default: None. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - actual_shape(Variable): An optional input to specify output shape + actual_shape(Variable): An optional input to specify output shape dynamically. If provided, image resize according to this given shape rather than :attr:`out_shape` and :attr:`scale` specifying @@ -8606,85 +9170,87 @@ def resize_nearest(input, `[batch_size, input_channels, input_height, input_width]`. Returns: - Variable: 4-D tensor(NCHW or NHWC). + Variable: 4-D tensor(NCHW or NHWC). Examples: .. 
code-block:: python - #declarative mode - import paddle.fluid as fluid - import numpy as np - import paddle - paddle.enable_static() + #declarative mode + import paddle.fluid as fluid + import numpy as np + import paddle + paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,10]) + input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = fluid.layers.resize_nearest(input=input,out_shape=[12,12]) + #1 + output = fluid.layers.resize_nearest(input=input,out_shape=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.resize_nearest(input=input,out_shape=[12,dim1]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.resize_nearest(input=input,out_shape=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.resize_nearest(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.resize_nearest(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.resize_nearest(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.resize_nearest(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,10).astype("float32") + input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) + #1 + # (2, 3, 12, 12) + #2 + # (2, 3, 12, 2) + #3 + # (2, 3, 3, 12) + #4 + # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_nearest(input=input, out_shape=[12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_nearest(input=input, out_shape=[12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L] + # [2L, 3L, 12L, 12L] """ - return image_resize(input, - out_shape, - scale, - name, - 'NEAREST', - actual_shape, - align_corners, - align_mode=1, - data_format=data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'NEAREST', + actual_shape, + align_corners, + align_mode=1, + data_format=data_format, + ) def image_resize_short(input, out_short_len, resample='BILINEAR'): @@ -8712,15 +9278,18 @@ def image_resize_short(input, 
out_short_len, resample='BILINEAR'): in_shape = input.shape if len(in_shape) != 4: raise ValueError( - "The rank of input must be 4 (num_batches, channels, in_h, in_w).") + "The rank of input must be 4 (num_batches, channels, in_h, in_w)." + ) hw = in_shape[2:4] short_idx = hw.index(min(hw)) long_idx = 1 - short_idx out_shape = list(hw) out_shape[short_idx] = out_short_len out_shape[long_idx] = int( - float(out_shape[long_idx]) * - (float(out_short_len) / float(hw[short_idx])) + 0.5) + float(out_shape[long_idx]) + * (float(out_short_len) / float(hw[short_idx])) + + 0.5 + ) return image_resize(input=input, out_shape=out_shape, resample=resample) @@ -8759,12 +9328,12 @@ def gather(input, index, overwrite=True): index (Tensor): The index input tensor with rank=1. Data type is int32 or int64. overwrite (bool, optional): The mode that updating the grad when has same index. If True, use the overwrite mode to update the grad of the same index, - if False, use the accumulate mode to update the grad of the same index. - Default value is True. + if False, use the accumulate mode to update the grad of the same index. + Default value is True. Returns: output (Tensor): The output is a tensor with the same rank as input. - + Examples: .. code-block:: python @@ -8781,19 +9350,21 @@ def gather(input, index, overwrite=True): return _legacy_C_ops.gather(input, index, None, 'overwrite', overwrite) check_variable_and_dtype( - input, 'x', - ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], 'gather') + input, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'gather', + ) check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') helper = LayerHelper('gather', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="gather", - inputs={ - "X": input, - "Index": index - }, - outputs={"Out": out}, - attrs={'overwrite': overwrite}) + helper.append_op( + type="gather", + inputs={"X": input, "Index": index}, + outputs={"Out": out}, + attrs={'overwrite': overwrite}, + ) return out @@ -8878,18 +9449,20 @@ def gather_nd(input, index, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.gather_nd(input, index) check_variable_and_dtype( - input, 'input', - ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], 'gather_np') + input, + 'input', + ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], + 'gather_np', + ) check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np') helper = LayerHelper('gather_nd', **locals()) dtype = helper.input_dtype() output = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="gather_nd", - inputs={ - "X": input, - "Index": index - }, - outputs={"Out": output}) + helper.append_op( + type="gather_nd", + inputs={"X": input, "Index": index}, + outputs={"Out": output}, + ) return output @@ -8897,8 +9470,8 @@ def gather_nd(input, index, name=None): def scatter(input, index, updates, name=None, overwrite=True): """ :alias_main: paddle.scatter - :alias: paddle.scatter,paddle.tensor.scatter,paddle.tensor.manipulation.scatter - :old_api: paddle.fluid.layers.scatter + :alias: paddle.scatter,paddle.tensor.scatter,paddle.tensor.manipulation.scatter + :old_api: paddle.fluid.layers.scatter **Scatter Layer** @@ -8937,8 +9510,8 @@ def scatter(input, index, updates, name=None, overwrite=True): name(str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name` . overwrite (bool): The mode that updating the output when there are same indices. If True, use the overwrite mode to update the output of the same index, - if False, use the accumulate mode to update the output of the same index. - Default value is True. + if False, use the accumulate mode to update the output of the same index. + Default value is True. Returns: Variable(Tensor|LoDTensor): The output is a Tensor with the same shape as input. @@ -8974,14 +9547,12 @@ def scatter(input, index, updates, name=None, overwrite=True): helper = LayerHelper('scatter', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="scatter", - inputs={ - "X": input, - "Ids": index, - "Updates": updates - }, - attrs={'overwrite': overwrite}, - outputs={"Out": out}) + helper.append_op( + type="scatter", + inputs={"X": input, "Ids": index, "Updates": updates}, + attrs={'overwrite': overwrite}, + outputs={"Out": out}, + ) return out @@ -9066,13 +9637,11 @@ def scatter_nd_add(ref, index, updates, name=None): helper = LayerHelper('scatter_nd_add', **locals()) dtype = helper.input_dtype(input_param_name='ref') output = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="scatter_nd_add", - inputs={ - "X": ref, - "Index": index, - "Updates": updates - }, - outputs={"Out": output}) + helper.append_op( + type="scatter_nd_add", + inputs={"X": ref, "Index": index, "Updates": updates}, + outputs={"Out": output}, + ) return output @@ -9149,9 +9718,9 @@ def random_crop(x, shape, seed=None): """ helper = LayerHelper("random_crop", **locals()) - check_variable_and_dtype(x, 'x', - ['float32', 'float64', 'uint8', 'int16', 'int32'], - 'random_crop') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'uint8', 'int16', 'int32'], 'random_crop' + ) check_type(shape, 'shape', (list, Variable), 'random_crop') dtype = x.dtype out = helper.create_variable_for_type_inference(dtype) @@ -9163,19 +9732,16 @@ def random_crop(x, shape, seed=None): seed = helper.create_variable( name=unique_name.generate("random_crop_seed"), dtype="int64", - persistable=True) + persistable=True, + ) elif not isinstance(seed, Variable): raise ValueError("'seed' must be a Variable or an int.") - helper.append_op(type="random_crop", - inputs={ - "X": x, - "Seed": seed - }, - outputs={ - "Out": out, - "SeedOut": seed - }, - attrs=op_attrs) + helper.append_op( + type="random_crop", + inputs={"X": x, "Seed": seed}, + outputs={"Out": out, "SeedOut": seed}, + attrs=op_attrs, + ) return out @@ -9246,8 +9812,7 @@ def relu(x, name=None): out1 = fluid.layers.relu(x1) print(out1.numpy()) # [[0. 0. ] - # [1. 2.6]] -""" + # [1. 
2.6]]""" if in_dygraph_mode(): return _C_ops.relu(x) @@ -9260,9 +9825,9 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", - inputs={"X": helper.input('x')}, - outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out} + ) return out @@ -9332,10 +9897,9 @@ def selu(x, scale=None, alpha=None, name=None): if alpha is not None: attrs["alpha"] = alpha - helper.append_op(type="selu", - inputs={"X": x}, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs + ) return out @@ -9386,23 +9950,23 @@ def mean_iou(input, label, num_classes): return _legacy_C_ops.mean_iou(input, label, 'num_classes', num_classes) helper = LayerHelper('mean_iou', **locals()) - check_variable_and_dtype(input, 'Predictions', ['int32', 'int64'], - 'mean_iou') + check_variable_and_dtype( + input, 'Predictions', ['int32', 'int64'], 'mean_iou' + ) check_variable_and_dtype(label, 'Labels', ['int32', 'int64'], 'mean_iou') out_mean_iou = helper.create_variable_for_type_inference(dtype='float32') out_wrong = helper.create_variable_for_type_inference(dtype='int32') out_correct = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type="mean_iou", - inputs={ - "Predictions": input, - "Labels": label - }, - outputs={ - "OutMeanIou": out_mean_iou, - "OutWrong": out_wrong, - "OutCorrect": out_correct - }, - attrs={"num_classes": num_classes}) + helper.append_op( + type="mean_iou", + inputs={"Predictions": input, "Labels": label}, + outputs={ + "OutMeanIou": out_mean_iou, + "OutWrong": out_wrong, + "OutCorrect": out_correct, + }, + attrs={"num_classes": num_classes}, + ) return out_mean_iou, out_wrong, out_correct @@ -9499,10 +10063,12 @@ def crop(x, shape=None, offsets=None, name=None): else: attrs['offsets'] = offsets - helper.append_op(type='crop', - inputs=ipts, - outputs={'Out': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='crop', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out @@ -9595,11 +10161,13 @@ def crop_tensor(x, shape=None, offsets=None, name=None): """ helper = LayerHelper('crop_tensor', **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'crop_tensor') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'crop_tensor' + ) check_type(shape, 'shape', (list, tuple, Variable), 'crop_tensor') - check_type(offsets, 'offsets', (list, tuple, Variable, type(None)), - 'crop_tensor') + check_type( + offsets, 'offsets', (list, tuple, Variable, type(None)), 'crop_tensor' + ) if offsets is None: offsets = [0] * len(x.shape) @@ -9612,25 +10180,30 @@ def crop_tensor(x, shape=None, offsets=None, name=None): if not isinstance(shape_val, int): raise TypeError( "Attr(shape)'s dtype of Op(crop_tensor) should be int32, but received: %s." - % type(shape_val)) + % type(shape_val) + ) if shape_val == 0: raise ValueError( "Attr(shape) of Op(crop_tensor) should not be zero, but received: %s." - % str(shape_val)) + % str(shape_val) + ) if shape_val < -1: raise ValueError( "When the element in Attr(shape) of Op(crop_tensor) is negative, only -1 is supported, but received: %s." 
- % str(shape_val)) + % str(shape_val) + ) def _attr_offsets_check(offset_val): if not isinstance(offset_val, int): raise TypeError( "Attr(offsets)'s dtype of Op(crop_tensor) should be int32, but received: %s." - % type(offset_val)) + % type(offset_val) + ) if offset_val < 0: raise ValueError( "Attr(offsets) of Op(crop_tensor) should be greater or equal to zero, but received: %s." - % str(offset_val)) + % str(offset_val) + ) if isinstance(offsets, Variable): offsets.stop_gradient = True @@ -9671,11 +10244,9 @@ def crop_tensor(x, shape=None, offsets=None, name=None): else: _attr_shape_check(dim_size) temp_out = helper.create_variable_for_type_inference('int32') - fill_constant([1], - 'int32', - dim_size, - force_cpu=True, - out=temp_out) + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out + ) new_shape_tensor.append(temp_out) shape_attr.append(dim_size) ipts['ShapeTensor'] = new_shape_tensor @@ -9685,18 +10256,20 @@ def crop_tensor(x, shape=None, offsets=None, name=None): _attr_shape_check(dim_size) attrs['shape'] = shape - helper.append_op(type='crop_tensor', - inputs=ipts, - outputs={'Out': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='crop_tensor', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out def affine_grid(theta, out_shape, name=None): """ :alias_main: paddle.nn.functional.affine_grid - :alias: paddle.nn.functional.affine_grid,paddle.nn.functional.vision.affine_grid - :old_api: paddle.fluid.layers.affine_grid + :alias: paddle.nn.functional.affine_grid,paddle.nn.functional.vision.affine_grid + :old_api: paddle.fluid.layers.affine_grid It generates a grid of (x,y) coordinates using the parameters of the affine transformation that correspond to a set of points where @@ -9739,11 +10312,15 @@ def affine_grid(theta, out_shape, name=None): """ helper = LayerHelper('affine_grid') - check_variable_and_dtype(theta, 'theta', ['float32', 'float64'], - 'affine_grid') - - if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ - isinstance(out_shape, Variable)): + check_variable_and_dtype( + theta, 'theta', ['float32', 'float64'], 'affine_grid' + ) + + if not ( + isinstance(out_shape, list) + or isinstance(out_shape, tuple) + or isinstance(out_shape, Variable) + ): raise ValueError("The out_shape should be a list, tuple or Variable.") if not isinstance(theta, Variable): @@ -9754,27 +10331,32 @@ def affine_grid(theta, out_shape, name=None): attrs = {} if isinstance(out_shape, Variable): ipts['OutputShape'] = out_shape - check_variable_and_dtype(out_shape, 'out_shape', ['int32'], - 'affine_grid') + check_variable_and_dtype( + out_shape, 'out_shape', ['int32'], 'affine_grid' + ) else: attrs['output_shape'] = out_shape if core.is_compiled_with_rocm(): # ROCM platform do not have MIOPEN kernel for affine_grid attrs['use_cudnn'] = False - helper.append_op(type='affine_grid', - inputs=ipts, - outputs={'Output': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='affine_grid', + inputs=ipts, + outputs={'Output': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out -def pad2d(input, - paddings=[0, 0, 0, 0], - mode='constant', - pad_value=0.0, - data_format="NCHW", - name=None): +def pad2d( + input, + paddings=[0, 0, 0, 0], + mode='constant', + pad_value=0.0, + data_format="NCHW", + name=None, +): """ Pad 2-d images according to 'paddings' and 'mode'. @@ -9788,10 +10370,10 @@ def pad2d(input, Otherwise, it is a 1-D Tensor with shape [4]. 
Data type is int32. Default is [0, 0, 0, 0]. mode (str): Three modes: 'constant' (default), 'reflect', 'edge' . - When in 'constant' mode, this op uses a constant value to pad the input tensor. - When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. - When in 'edge' mode, uses input boundaries to pad the input tensor. - Default is 'constant' + When in 'constant' mode, this op uses a constant value to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'edge' mode, uses input boundaries to pad the input tensor. + Default is 'constant' pad_value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0 data_format (str): An string from: "NHWC", "NCHW". Specify the data format of the input data. @@ -9799,7 +10381,7 @@ def pad2d(input, name (str, optional) : The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Returns: + Returns: Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. Examples: @@ -9862,15 +10444,29 @@ def pad2d(input, # [2. 1. 2. 3. 2.]]]] """ if _non_static_mode(): - _paddings = paddings.numpy().tolist() if isinstance( - paddings, Variable) else paddings - return _legacy_C_ops.pad2d(input, 'mode', mode, 'pad_value', pad_value, - 'data_format', data_format, 'paddings', - _paddings) + _paddings = ( + paddings.numpy().tolist() + if isinstance(paddings, Variable) + else paddings + ) + return _legacy_C_ops.pad2d( + input, + 'mode', + mode, + 'pad_value', + pad_value, + 'data_format', + data_format, + 'paddings', + _paddings, + ) check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - "pad2d") + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + "pad2d", + ) attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format} inputs = {'X': [input]} @@ -9882,16 +10478,18 @@ def pad2d(input, helper = LayerHelper('pad2d', **locals()) - assert mode in ['reflect', 'edge', 'constant' - ], "mode should be one of constant, reflect, edge." + assert mode in [ + 'reflect', + 'edge', + 'constant', + ], "mode should be one of constant, reflect, edge." 
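    # Rough NumPy analogue of the documented [pad_top, pad_bottom, pad_left,
    # pad_right] ordering for an NCHW input (illustrative sketch only):
    #   np.pad(x, ((0, 0), (0, 0), (p[0], p[1]), (p[2], p[3])),
    #          mode='constant', constant_values=pad_value)
    # 'reflect' and 'edge' roughly correspond to the NumPy modes of the same names.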
dtype = helper.input_dtype(input_param_name='input') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad2d', - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs + ) return out @@ -9900,8 +10498,8 @@ def pad2d(input, def elu(x, alpha=1.0, name=None): """ :alias_main: paddle.nn.functional.elu - :alias: paddle.nn.functional.elu,paddle.nn.functional.activation.elu - :old_api: paddle.fluid.layers.elu + :alias: paddle.nn.functional.elu,paddle.nn.functional.activation.elu + :old_api: paddle.fluid.layers.elu ${comment} Args: @@ -9930,10 +10528,12 @@ def elu(x, alpha=1.0, name=None): helper = LayerHelper('elu', **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='elu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'alpha': alpha}) + helper.append_op( + type='elu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}, + ) return out @@ -9971,13 +10571,15 @@ def relu6(x, threshold=6.0, name=None): helper = LayerHelper('relu6', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='relu6', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'threshold': threshold, - 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"] - }) + helper.append_op( + type='relu6', + inputs={'X': x}, + outputs={'Out': out}, + attrs={ + 'threshold': threshold, + 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"], + }, + ) return out @@ -10014,7 +10616,8 @@ def pow(x, factor=1.0, name=None): # y_2 is x^{3.0} """ check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'pow') + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'pow' + ) helper = LayerHelper('pow', **locals()) inputs = {'X': x} @@ -10027,10 +10630,9 @@ def pow(x, factor=1.0, name=None): attrs['factor'] = factor out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='pow', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -10070,13 +10672,12 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): helper = LayerHelper('stanh', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='stanh', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'scale_a': scale_a, - 'scale_b': scale_b - }) + helper.append_op( + type='stanh', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale_a': scale_a, 'scale_b': scale_b}, + ) return out @@ -10109,18 +10710,18 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): if _non_static_mode(): return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hard_sigmoid') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hard_sigmoid' + ) helper = LayerHelper('hard_sigmoid', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='hard_sigmoid', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'slope': slope, - 'offset': offset - }) + helper.append_op( + type='hard_sigmoid', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': slope, 'offset': offset}, + ) return out @@ -10128,8 +10729,8 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, 
name=None): def swish(x, beta=1.0, name=None): r""" :alias_main: paddle.nn.functional.swish - :alias: paddle.nn.functional.swish,paddle.nn.functional.activation.swish - :old_api: paddle.fluid.layers.swish + :alias: paddle.nn.functional.swish,paddle.nn.functional.activation.swish + :old_api: paddle.fluid.layers.swish Elementwise swish activation function. See `Searching for Activation Functions `_ for more details. @@ -10201,16 +10802,19 @@ def swish(x, beta=1.0, name=None): helper = LayerHelper('swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'slope': beta}) + helper.append_op( + type='swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': beta}, + ) return out @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" + prelu activation. .. math:: @@ -10225,26 +10829,19 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): element: All elements do not share alpha. Each element has its own alpha. Parameters: - x (Tensor): The input Tensor or LoDTensor with data type float32. - mode (str): The mode for weight sharing. - - param_attr (ParamAttr|None, optional): The parameter attribute for the learnable \ - weight (alpha), it can be create by ParamAttr. None by default. \ - For detailed information, please refer to :ref:`api_fluid_ParamAttr`. - - name (str, optional): Name for the operation (optional, default is None). \ - For more information, please refer to :ref:`api_guide_Name`. - + param_attr (ParamAttr|None, optional): The parameter attribute for the learnable + weight (alpha), it can be create by ParamAttr. None by default. data_format(str, optional): Data format that specifies the layout of input. It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW". + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A tensor with the same shape and data type as x. + Tensor, A tensor with the same shape and data type as x. Examples: - .. code-block:: python import paddle @@ -10265,52 +10862,57 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): if mode == 'channel': true_data_format = [ - 'NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC' + 'NC', + 'NCL', + 'NCHW', + 'NCDHW', + 'NLC', + 'NHWC', + 'NDHWC', ] if data_format not in true_data_format: raise ValueError( "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', " - "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)) + "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format) + ) data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC' - assert len( - x.shape - ) >= 2, "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'" - #NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]). + assert ( + len(x.shape) >= 2 + ), "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'" + # NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]). # To be consistent with Prelu, it is simplified. - #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version. - #NOTE(GuoxiaWang): support NHWC data format + # NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version. 
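        # For illustration: with mode='channel' and an NCHW input of shape
        # [N, 16, H, W], alpha below is created with shape [1, 16, 1, 1];
        # with data_format='NHWC' (input of shape [N, H, W, 16]) it becomes
        # [1, 1, 1, 16] instead.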
+ # NOTE(GuoxiaWang): support NHWC data format if data_format == 'NHWC': alpha_shape = [1, 1, 1, x.shape[-1]] else: alpha_shape = [1, x.shape[1], 1, 1] elif mode == 'element': - assert len( - x.shape - ) >= 1, "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'" + assert ( + len(x.shape) >= 1 + ), "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'" alpha_shape = [1] + list(x.shape)[1:] dtype = helper.input_dtype(input_param_name='x') - alpha = helper.create_parameter(attr=helper.param_attr, - shape=alpha_shape, - dtype=dtype, - is_bias=False, - default_initializer=Constant(0.25)) + alpha = helper.create_parameter( + attr=helper.param_attr, + shape=alpha_shape, + dtype=dtype, + is_bias=False, + default_initializer=Constant(0.25), + ) if in_dygraph_mode(): return _C_ops.prelu(x, alpha, data_format, mode) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="prelu", - inputs={ - "X": x, - 'Alpha': alpha - }, - attrs={ - "mode": mode, - "data_format": data_format - }, - outputs={"Out": out}) + helper.append_op( + type="prelu", + inputs={"X": x, 'Alpha': alpha}, + attrs={"mode": mode, "data_format": data_format}, + outputs={"Out": out}, + ) return out @@ -10351,13 +10953,12 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): helper = LayerHelper('brelu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='brelu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 't_min': t_min, - 't_max': t_max - }) + helper.append_op( + type='brelu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'t_min': t_min, 't_max': t_max}, + ) return out @@ -10424,15 +11025,18 @@ def soft_relu(x, threshold=40.0, name=None): res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) print(res) # [array([[0.6931472, 1.3132616], [2.126928 , 3.0485873]], dtype=float32)] """ - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'soft_relu') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'soft_relu' + ) helper = LayerHelper('soft_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='soft_relu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='soft_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out @@ -10501,8 +11105,11 @@ def flatten(x, axis=1, name=None): # out shape is [16, 3] """ check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], - 'flatten') + x, + 'x', + ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], + 'flatten', + ) if _non_static_mode(): return _legacy_C_ops.flatten2(x, 'axis', axis)[0] @@ -10516,13 +11123,12 @@ def flatten(x, axis=1, name=None): out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='flatten2', - inputs={"X": x}, - outputs={ - 'Out': out, - 'XShape': x_shape - }, - attrs={"axis": axis}) + helper.append_op( + type='flatten2', + inputs={"X": x}, + outputs={'Out': out, 'XShape': x_shape}, + attrs={"axis": axis}, + ) return out @@ -10582,10 +11188,10 @@ def stack(x, axis=0, name=None): Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. 
``axis`` range is ``[-(R+1), R+1)``, - where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. - + Returns: Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`. @@ -10616,42 +11222,53 @@ def stack(x, axis=0, name=None): if not isinstance(x, list) and not isinstance(x, tuple): # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. # In that case, Variable is array of tensors indeed. - if isinstance(x, Variable) and x.desc.type( - ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + isinstance(x, Variable) + and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): x = [x] else: raise TypeError( - "The type of '%s' in %s must be %s, but received %s" % - ('x', 'stack', 'list[Tensor], tuple[Tensor] or TensorArray', - type(x))) + "The type of '%s' in %s must be %s, but received %s" + % ( + 'x', + 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x), + ) + ) helper = LayerHelper('stack', **locals()) out = helper.create_variable_for_type_inference(x[0].dtype) if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: - assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ - "number of the elements must be 1, but received %s." % len(x) + assert len(x) == 1, ( + "If the elements of 'x' in stack are Variable(LoDTensorArray), " + "number of the elements must be 1, but received %s." % len(x) + ) out_index = helper.create_variable_for_type_inference(dtype="int32") for i in x: - check_variable_and_dtype(i, 'x', \ - ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack') - - helper.append_op(type='tensor_array_to_tensor', - inputs={'X': x[0]}, - outputs={ - 'Out': [out], - 'OutIndex': [out_index] - }, - attrs={ - 'axis': axis, - 'use_stack': True - }) + check_variable_and_dtype( + i, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'stack', + ) + + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': x[0]}, + outputs={'Out': [out], 'OutIndex': [out_index]}, + attrs={'axis': axis, 'use_stack': True}, + ) else: - helper.append_op(type='stack', - inputs={'X': x}, - outputs={'Y': out}, - attrs={'axis': axis}) + helper.append_op( + type='stack', + inputs={'X': x}, + outputs={'Y': out}, + attrs={'axis': axis}, + ) return out @@ -10715,21 +11332,12 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0): out = helper.create_variable_for_type_inference(dtype=ins.dtype) loss_weight = helper.create_variable_for_type_inference(dtype=np.float64) mmap = helper.create_variable_for_type_inference(dtype=ins_tag.dtype) - helper.append_op(type='filter_by_instag', - inputs={ - 'Ins': ins, - 'Ins_tag': ins_tag, - 'Filter_tag': filter_tag - }, - outputs={ - 'Out': out, - 'LossWeight': loss_weight, - 'IndexMap': mmap - }, - attrs={ - 'is_lod': is_lod, - 'out_val_if_empty': out_val_if_empty - }) + helper.append_op( + type='filter_by_instag', + inputs={'Ins': ins, 'Ins_tag': ins_tag, 'Filter_tag': filter_tag}, + outputs={'Out': out, 'LossWeight': loss_weight, 'IndexMap': mmap}, + attrs={'is_lod': is_lod, 'out_val_if_empty': out_val_if_empty}, + ) return [out, loss_weight] @@ -10737,8 +11345,8 @@ def filter_by_instag(ins, ins_tag, filter_tag, 
is_lod, out_val_if_empty=0): def unstack(x, axis=0, num=None): """ :alias_main: paddle.unstack - :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack - :old_api: paddle.fluid.layers.unstack + :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack + :old_api: paddle.fluid.layers.unstack **UnStack Layer** @@ -10787,13 +11395,12 @@ def unstack(x, axis=0, num=None): for _ in range(num): outs.append(helper.create_variable_for_type_inference(x.dtype)) - helper.append_op(type='unstack', - inputs={'X': [x]}, - outputs={'Y': outs}, - attrs={ - 'axis': axis, - 'num': num - }) + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, 'num': num}, + ) return outs @@ -10801,8 +11408,8 @@ def unstack(x, axis=0, num=None): def expand(x, expand_times, name=None): """ :alias_main: paddle.expand - :alias: paddle.expand,paddle.tensor.expand,paddle.tensor.manipulation.expand - :old_api: paddle.fluid.layers.expand + :alias: paddle.expand,paddle.tensor.expand,paddle.tensor.manipulation.expand + :old_api: paddle.fluid.layers.expand This operation tiles ``x`` multiple times according to the parameter ``expand_times``. The times number for each dimension of ``x`` is set by the parameter ``expand_times``. @@ -10876,12 +11483,16 @@ def expand(x, expand_times, name=None): inputs = {"X": [x]} attrs = {} check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'expand') + x, + 'x', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'expand', + ) check_type(expand_times, 'expand_times', (list, tuple, Variable), 'expand') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == True: raise ValueError( - "expand op bool date type must set the stop_gradient to be False") + "expand op bool date type must set the stop_gradient to be False" + ) helper = LayerHelper('expand', input=x, **locals()) @@ -10892,8 +11503,9 @@ def expand(x, expand_times, name=None): attrs_expand_times.append(-1) else: attrs_expand_times.append(times) - assert times > 0, ( - "Each element given in expand_times must not be negative.") + assert ( + times > 0 + ), "Each element given in expand_times must not be negative." return attrs_expand_times if isinstance(expand_times, Variable): @@ -10903,14 +11515,14 @@ def expand(x, expand_times, name=None): attrs['expand_times'] = get_attr_expand_times(expand_times) if utils._contain_var(expand_times): inputs['expand_times_tensor'] = utils._convert_to_tensor_list( - expand_times) + expand_times + ) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='expand', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='expand', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -10918,9 +11530,9 @@ def expand(x, expand_times, name=None): def expand_as(x, target_tensor, name=None): """ :alias_main: paddle.expand_as - :alias: paddle.expand_as,paddle.tensor.expand_as,paddle.tensor.manipulation.expand_as - :old_api: paddle.fluid.layers.expand_as - + :alias: paddle.expand_as,paddle.tensor.expand_as,paddle.tensor.manipulation.expand_as + :old_api: paddle.fluid.layers.expand_as + expand_as operator tiles to the input by given expand tensor. You should set expand tensor for each dimension by providing tensor 'target_tensor'. The rank of X should be in [1, 6]. 
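    As a rough sketch (shapes here are made up), the result can be mimicked with
    ``numpy.tile``, assuming every dimension of ``target_tensor`` is a whole
    multiple of the corresponding dimension of ``x``:

    .. code-block:: python

        import numpy as np

        x = np.array([[1, 2, 3], [4, 5, 6]])      # shape [2, 3]
        target = np.zeros([6, 3])                 # shape [6, 3]
        reps = [t // s for t, s in zip(target.shape, x.shape)]  # [3, 1]
        out = np.tile(x, reps)                    # shape [6, 3]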
Please note that size of 'target_tensor' must be the same @@ -10985,12 +11597,15 @@ def expand_as(x, target_tensor, name=None): if _non_static_mode(): return _legacy_C_ops.expand_as(x, target_tensor) - check_variable_and_dtype(x, 'x', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'expand_as') - check_variable_and_dtype(target_tensor, 'target_tensor', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'expand_as') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as' + ) + check_variable_and_dtype( + target_tensor, + 'target_tensor', + ['float32', 'float64', 'int32', 'int64', 'bool'], + 'expand_as', + ) helper = LayerHelper('expand_as', input=x, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) @@ -11004,14 +11619,16 @@ from paddle.fluid.framework import convert_np_dtype_to_dtype_ @deprecated(since='1.8.0', update_to="paddle.uniform") @templatedoc() -def uniform_random_batch_size_like(input, - shape, - dtype='float32', - input_dim_idx=0, - output_dim_idx=0, - min=-1.0, - max=1.0, - seed=0): +def uniform_random_batch_size_like( + input, + shape, + dtype='float32', + input_dim_idx=0, + output_dim_idx=0, + min=-1.0, + max=1.0, + seed=0, +): """ This OP initializes a variable with random values sampled from a uniform distribution in the range [min, max). The input_dim_idx used to get the input dimension value which will be used to resize the output dimension. @@ -11074,39 +11691,46 @@ def uniform_random_batch_size_like(input, """ - check_variable_and_dtype(input, 'Input', ("float32", 'float64', "uint16"), - 'uniform_random_batch_size_like') + check_variable_and_dtype( + input, + 'Input', + ("float32", 'float64', "uint16"), + 'uniform_random_batch_size_like', + ) check_type(shape, 'shape', (list, tuple), 'uniform_random_batch_size_like') - check_dtype(dtype, 'dtype', ('float32', 'float64', "uint16"), - 'uniform_random_batch_size_like') + check_dtype( + dtype, + 'dtype', + ('float32', 'float64', "uint16"), + 'uniform_random_batch_size_like', + ) helper = LayerHelper('uniform_random_batch_size_like', **locals()) out = helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op(type='uniform_random_batch_size_like', - inputs={'Input': input}, - outputs={'Out': out}, - attrs={ - 'shape': shape, - 'input_dim_idx': input_dim_idx, - 'output_dim_idx': output_dim_idx, - 'min': min, - 'max': max, - 'seed': seed, - 'dtype': c_dtype - }) + helper.append_op( + type='uniform_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'min': min, + 'max': max, + 'seed': seed, + 'dtype': c_dtype, + }, + ) return out @deprecated(since="2.0.0", update_to="paddle.normal") @templatedoc() -def gaussian_random(shape, - mean=0.0, - std=1.0, - seed=0, - dtype='float32', - name=None): +def gaussian_random( + shape, mean=0.0, std=1.0, seed=0, dtype='float32', name=None +): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. @@ -11162,21 +11786,21 @@ def gaussian_random(shape, # result_3 is: # [[-0.12310527, 0.8187662, 1.923219 ] # [ 0.70721835, 0.5210541, -0.03214082]] - + .. 
code-block:: python - + # declarative mode # required: skiptest import numpy as np from paddle import fluid - + x = fluid.layers.gaussian_random((2, 3), std=2., seed=10) - + place = fluid.CPUPlace() exe = fluid.Executor(place) start = fluid.default_startup_program() main = fluid.default_main_program() - + exe.run(start) x_np, = exe.run(main, feed={}, fetch_list=[x]) @@ -11190,11 +11814,11 @@ def gaussian_random(shape, import numpy as np from paddle import fluid import paddle.fluid.dygraph as dg - + place = fluid.CPUPlace() with dg.guard(place) as g: x = fluid.layers.gaussian_random((2, 4), mean=2., dtype="float32", seed=10) - x_np = x.numpy() + x_np = x.numpy() x_np # array([[2.3060477 , 2.676496 , 3.9911983 , 0.9990833 ], # [2.8675377 , 2.2279181 , 0.79029655, 2.8447366 ]], dtype=float32) @@ -11205,15 +11829,24 @@ def gaussian_random(shape, if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) place = _current_expected_place() - return _C_ops.gaussian_random(shape, float(mean), float(std), seed, - dtype, place) + return _C_ops.gaussian_random( + shape, float(mean), float(std), seed, dtype, place + ) if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _legacy_C_ops.gaussian_random('shape', shape, - 'mean', float(mean), 'std', - float(std), 'seed', seed, 'dtype', - dtype) + return _legacy_C_ops.gaussian_random( + 'shape', + shape, + 'mean', + float(mean), + 'std', + float(std), + 'seed', + seed, + 'dtype', + dtype, + ) check_type(shape, 'shape', (list, tuple, Variable), 'gaussian_random/randn') check_dtype(dtype, 'dtype', ['float32', 'float64'], 'gaussian_random/randn') @@ -11224,19 +11857,17 @@ def gaussian_random(shape, 'std': std, 'seed': seed, 'dtype': dtype, - 'use_mkldnn': False + 'use_mkldnn': False, } - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=shape, - op_type='gaussian_random/randn') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='gaussian_random/randn' + ) helper = LayerHelper('gaussian_random', **locals()) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='gaussian_random', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='gaussian_random', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -11270,28 +11901,28 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): helper = LayerHelper('sampling_id', **locals()) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='sampling_id', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'min': min, - 'max': max, - 'seed': seed - }) + helper.append_op( + type='sampling_id', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'min': min, 'max': max, 'seed': seed}, + ) return out @deprecated(since='1.8.0', update_to="paddle.normal") @templatedoc() -def gaussian_random_batch_size_like(input, - shape, - input_dim_idx=0, - output_dim_idx=0, - mean=0.0, - std=1.0, - seed=0, - dtype='float32'): +def gaussian_random_batch_size_like( + input, + shape, + input_dim_idx=0, + output_dim_idx=0, + mean=0.0, + std=1.0, + seed=0, + dtype='float32', +): """ ${comment} @@ -11322,26 +11953,40 @@ def gaussian_random_batch_size_like(input, """ helper = LayerHelper('gaussian_random_batch_size_like', **locals()) - check_type(input, 'input', (Variable), - 'fluid.layers.gaussian_random_batch_size_like') - check_type(shape, 'shape', (list, tuple), - 'fluid.layers.gaussian_random_batch_size_like') - check_dtype(dtype, 'dtype', ['float16', 'float32', 
'int'], - 'fluid.layers.gaussian_random_batch_size_like') + check_type( + input, + 'input', + (Variable), + 'fluid.layers.gaussian_random_batch_size_like', + ) + check_type( + shape, + 'shape', + (list, tuple), + 'fluid.layers.gaussian_random_batch_size_like', + ) + check_dtype( + dtype, + 'dtype', + ['float16', 'float32', 'int'], + 'fluid.layers.gaussian_random_batch_size_like', + ) out = helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op(type='gaussian_random_batch_size_like', - inputs={'Input': input}, - outputs={'Out': out}, - attrs={ - 'shape': shape, - 'input_dim_idx': input_dim_idx, - 'output_dim_idx': output_dim_idx, - 'mean': mean, - 'std': std, - 'seed': seed, - 'dtype': c_dtype - }) + helper.append_op( + type='gaussian_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': c_dtype, + }, + ) return out @@ -11453,7 +12098,7 @@ def slice(input, axes, starts, ends): ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. Then: result = [ [2, 3, 4], ] # result = data[0:1, 1:4] - + Args: input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . @@ -11500,7 +12145,8 @@ def slice(input, axes, starts, ends): axes = list(axes) if len(axes) == 0: raise ValueError( - "Input axes should not be an empty list/tuple.") + "Input axes should not be an empty list/tuple." + ) for i in range(len(axes)): if axes[i] < 0: axes[i] = max(0, axes[i] + len(input.shape)) @@ -11509,8 +12155,10 @@ def slice(input, axes, starts, ends): else: raise ValueError( - "Input axes must be a python list or tuple, but reveived {}". - format(type(axes))) + "Input axes must be a python list or tuple, but reveived {}".format( + type(axes) + ) + ) infer_flags = list(1 for i in range(len(axes))) @@ -11518,7 +12166,8 @@ def slice(input, axes, starts, ends): if isinstance(starts, (list, tuple)): starts = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in starts ] elif isinstance(starts, tmp_tensor_type): @@ -11528,7 +12177,9 @@ def slice(input, axes, starts, ends): if isinstance(ends, (list, tuple)): ends = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item for item in ends + if isinstance(item, tmp_tensor_type) + else item + for item in ends ] attrs += ('ends', ends) elif isinstance(ends, tmp_tensor_type): @@ -11546,7 +12197,8 @@ def slice(input, axes, starts, ends): axes = list(axes) if len(axes) == 0: raise ValueError( - "Input axes should not be an empty list/tuple.") + "Input axes should not be an empty list/tuple." 
+ ) for i in range(len(axes)): if axes[i] < 0: axes[i] = max(0, axes[i] + len(input.shape)) @@ -11555,8 +12207,10 @@ def slice(input, axes, starts, ends): else: raise ValueError( - "Input axes must be a python list or tuple, but reveived {}" - .format(type(axes))) + "Input axes must be a python list or tuple, but reveived {}".format( + type(axes) + ) + ) infer_flags = list(1 for i in range(len(axes))) @@ -11565,7 +12219,8 @@ def slice(input, axes, starts, ends): if isinstance(starts, (list, tuple)): starts = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in starts ] attrs += ('starts', starts) @@ -11577,7 +12232,8 @@ def slice(input, axes, starts, ends): if isinstance(ends, (list, tuple)): ends = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in ends ] attrs += ('ends', ends) @@ -11586,16 +12242,27 @@ def slice(input, axes, starts, ends): ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) - return _legacy_C_ops.slice(input, starts_tensor, ends_tensor, None, - None, 'axes', axes, 'infer_flags', - infer_flags, *attrs) + return _legacy_C_ops.slice( + input, + starts_tensor, + ends_tensor, + None, + None, + 'axes', + axes, + 'infer_flags', + infer_flags, + *attrs, + ) if not isinstance(starts, (list, tuple, Variable)): raise ValueError( - "Input starts must be an Variable, python list or tuple.") + "Input starts must be an Variable, python list or tuple." + ) if not isinstance(ends, (list, tuple, Variable)): raise ValueError( - "Input ends must be an Variable, python list or tuple.") + "Input ends must be an Variable, python list or tuple." + ) helper = LayerHelper('slice', **locals()) @@ -11642,11 +12309,11 @@ def slice(input, axes, starts, ends): # infer_flags attrs['infer_flags'] = infer_flags out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('input')) - helper.append_op(type='slice', - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + dtype=helper.input_dtype('input') + ) + helper.append_op( + type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out @@ -11655,8 +12322,8 @@ def slice(input, axes, starts, ends): def strided_slice(input, axes, starts, ends, strides): """ :alias_main: paddle.strided_slice - :alias: paddle.strided_slice,paddle.tensor.strided_slice,paddle.tensor.manipulation.strided_slice - :old_api: paddle.fluid.layers.strided_slice + :alias: paddle.strided_slice,paddle.tensor.strided_slice,paddle.tensor.manipulation.strided_slice + :old_api: paddle.fluid.layers.strided_slice This operator produces a slice of ``input`` along multiple axes. 
Similar to numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html @@ -11757,9 +12424,12 @@ def strided_slice(input, axes, starts, ends, strides): helper = LayerHelper('strided_slice', **locals()) - check_variable_and_dtype(input, 'input', - ['bool', 'float32', 'float64', 'int32', 'int64'], - 'strided_slice') + check_variable_and_dtype( + input, + 'input', + ['bool', 'float32', 'float64', 'int32', 'int64'], + 'strided_slice', + ) check_type(axes, 'axes', (list, tuple), 'strided_slice') check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') check_type(ends, 'ends', (list, tuple, Variable), 'strided_slice') @@ -11767,8 +12437,9 @@ def strided_slice(input, axes, starts, ends, strides): def check_list_elements_dtype(list_input, input_name): if isinstance(list_input, Variable): - check_dtype(list_input.dtype, input_name, ['int32'], - 'strided_slice') + check_dtype( + list_input.dtype, input_name, ['int32'], 'strided_slice' + ) else: for i, var in enumerate(list_input): var_name = input_name + '[' + str(i) + ']' @@ -11787,7 +12458,7 @@ def strided_slice(input, axes, starts, ends, strides): dim.stop_gradient = True new_list_tensor.append(dim) else: - assert (isinstance(dim, int)) + assert isinstance(dim, int) temp_out = helper.create_variable_for_type_inference('int32') fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) new_list_tensor.append(temp_out) @@ -11804,7 +12475,7 @@ def strided_slice(input, axes, starts, ends, strides): 'starts': starts, 'ends': ends, 'strides': strides, - 'infer_flags': infer_flags + 'infer_flags': infer_flags, } else: # starts @@ -11859,11 +12530,11 @@ def strided_slice(input, axes, starts, ends, strides): attrs['strides'] = strides attrs['infer_flags'] = infer_flags out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('input')) - helper.append_op(type='strided_slice', - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + dtype=helper.input_dtype('input') + ) + helper.append_op( + type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out @@ -11871,8 +12542,8 @@ def strided_slice(input, axes, starts, ends, strides): def shape(input): """ :alias_main: paddle.shape - :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape - :old_api: paddle.fluid.layers.shape + :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape + :old_api: paddle.fluid.layers.shape **Shape Layer** @@ -11930,16 +12601,29 @@ def shape(input): out.stop_gradient = True return out - check_variable_and_dtype(input, 'input', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'shape') + check_variable_and_dtype( + input, + 'input', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'shape', + ) helper = LayerHelper('shape', **locals()) out = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type='shape', - inputs={'Input': input}, - outputs={'Out': out}, - stop_gradient=True) + helper.append_op( + type='shape', + inputs={'Input': input}, + outputs={'Out': out}, + stop_gradient=True, + ) return out @@ -11987,7 +12671,7 @@ def size(input): Raises: TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. - + Examples: .. 
code-block:: python @@ -12007,8 +12691,11 @@ def size(input): return _legacy_C_ops.size(input) check_variable_and_dtype( - input, 'input', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], "size") + input, + 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + "size", + ) helper = LayerHelper('size', **locals()) out = helper.create_variable_for_type_inference(dtype='int64') helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out}) @@ -12024,33 +12711,35 @@ def _elementwise_op(helper): assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) check_variable_and_dtype( - x, 'x', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], - op_type) + x, + 'x', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + op_type, + ) check_variable_and_dtype( - y, 'y', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], - op_type) + y, + 'y', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + op_type, + ) axis = helper.kwargs.get('axis', -1) use_mkldnn = helper.kwargs.get('use_mkldnn', False) name = helper.kwargs.get('name', None) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type=op_type, - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs={ - 'axis': axis, - 'use_mkldnn': use_mkldnn - }) + helper.append_op( + type=op_type, + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs={'axis': axis, 'use_mkldnn': use_mkldnn}, + ) return helper.append_activation(out) def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ - + Putting scale and bias to the input Tensor as following: ``bias_after_scale`` is True: @@ -12075,9 +12764,9 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Tensor: Output tensor of scale operator, with shape and data type same as input. Examples: - + .. code-block:: python - + # scale as a float32 number import paddle @@ -12100,15 +12789,33 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): return dygraph_utils._append_activation_in_dygraph(out) if _non_static_mode(): _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale - out = _legacy_C_ops.scale(x, 'scale', float(_scale), 'bias', - float(bias), 'bias_after_scale', - bias_after_scale) + out = _legacy_C_ops.scale( + x, + 'scale', + float(_scale), + 'bias', + float(bias), + 'bias_after_scale', + bias_after_scale, + ) return dygraph_utils._append_activation_in_dygraph(out) - check_variable_and_dtype(x, "x", [ - 'float16', 'uint16', 'float32', 'float64', 'int8', 'int16', 'int32', - 'int64', 'uint8' - ], "scale") + check_variable_and_dtype( + x, + "x", + [ + 'float16', + 'uint16', + 'float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + ], + "scale", + ) inputs = {'X': [x]} attrs = { 'bias': float(bias), @@ -12121,91 +12828,90 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): helper = LayerHelper('scale', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='scale', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return helper.append_activation(out) def elementwise_add(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_add(x, y) - # z = x + y + import paddle.fluid as fluid + import numpy as np + import paddle + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_add(x, y) + # z = x + y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [3., 8., 6.] + print(z_value) # [3., 8., 6.] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_add(x, y, axis=1) - # z = x + y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_add(x, y, axis=1) + # z = x + y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_add(x, y, axis=3) - # z = x + y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_add(x, y, axis=3) + # z = x + y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): @@ -12215,7 +12921,8 @@ Examples: axis=axis, act=act, op_name='elementwise_add', - use_mkldnn=_global_flags()["FLAGS_use_mkldnn"]) + use_mkldnn=_global_flags()["FLAGS_use_mkldnn"], + ) return _elementwise_op(LayerHelper('elementwise_add', **locals())) @@ -12224,90 +12931,88 @@ Examples: def elementwise_div(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_div(x, y) - # z = x / y + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_div(x, y) + # z = x / y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [2., 0.6, 2.] + print(z_value) # [2., 0.6, 2.] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_div(x, y, axis=1) - # z = x / y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_div(x, y, axis=1) + # z = x / y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_div(x, y, axis=3) - # z = x / y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_div(x, y, axis=3) + # z = x / y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_div') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_div' + ) return _elementwise_op(LayerHelper('elementwise_div', **locals())) @@ -12315,90 +13020,88 @@ Examples: def elementwise_sub(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_sub(x, y) - # z = x - y + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_sub(x, y) + # z = x - y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [1., -2., 2.] + print(z_value) # [1., -2., 2.] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_sub(x, y, axis=1) - # z = x - y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_sub(x, y, axis=1) + # z = x - y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_sub(x, y, axis=3) - # z = x - y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_sub(x, y, axis=3) + # z = x - y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_sub') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_sub' + ) return _elementwise_op(LayerHelper('elementwise_sub', **locals())) @@ -12407,222 +13110,216 @@ Examples: def elementwise_mul(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_mul(x, y) - # z = x * y + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_mul(x, y) + # z = x * y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [2., 15., 8.] + print(z_value) # [2., 15., 8.] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_mul(x, y, axis=1) - # z = x * y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_mul(x, y, axis=1) + # z = x * y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_mul(x, y, axis=3) - # z = x * y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_mul(x, y, axis=3) + # z = x * y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_mul') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_mul' + ) return _elementwise_op(LayerHelper('elementwise_mul', **locals())) def elementwise_max(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_max - :alias: paddle.elementwise_max,paddle.tensor.elementwise_max,paddle.tensor.math.elementwise_max - :old_api: paddle.fluid.layers.elementwise_max + :alias_main: paddle.elementwise_max + :alias: paddle.elementwise_max,paddle.tensor.elementwise_max,paddle.tensor.math.elementwise_max + :old_api: paddle.fluid.layers.elementwise_max -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_max(x, y) + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_max(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[2, 5, 4] + print(z_value) #[2, 5, 4] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_max(x, y, axis=1) + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_max(x, y, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value)#[[[[1., 1., 1., 1., 1.] .... [1., 1., 1., 1., 1.]]]] + print(z_value)#[[[[1., 1., 1., 1., 1.] .... [1., 1., 1., 1., 1.]]]] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_max') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_max' + ) return _elementwise_op(LayerHelper('elementwise_max', **locals())) def elementwise_min(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_min - :alias: paddle.elementwise_min,paddle.tensor.elementwise_min,paddle.tensor.math.elementwise_min - :old_api: paddle.fluid.layers.elementwise_min + :alias_main: paddle.elementwise_min + :alias: paddle.elementwise_min,paddle.tensor.elementwise_min,paddle.tensor.math.elementwise_min + :old_api: paddle.fluid.layers.elementwise_min -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_min(x, y) + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_min(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[1, 3, 2] + print(z_value) #[1, 3, 2] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_min(x, y, axis=1) + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_min(x, y, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value)#[[[[0., 0., 0., 0., 0.] .... [0., 0., 0., 0., 0.]]]] + print(z_value)#[[[[0., 0., 0., 0., 0.] .... [0., 0., 0., 0., 0.]]]] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_min') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_min' + ) return _elementwise_op(LayerHelper('elementwise_min', **locals())) @@ -12630,37 +13327,35 @@ Examples: def elementwise_pow(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_pow(x, y) + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_pow(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[2, 243, 16] + print(z_value) #[2, 243, 16] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_pow') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_pow' + ) return _elementwise_op(LayerHelper('elementwise_pow', **locals())) @@ -12668,37 +13363,35 @@ Examples: def elementwise_mod(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([10, 15, 8]).astype('int32'), - "y": np.array([3, 6, 5]).astype('int32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='int32') - y = fluid.data(name="y", shape=[3], dtype='int32') - z = fluid.layers.elementwise_mod(x, y) + def gen_data(): + return { + "x": np.array([10, 15, 8]).astype('int32'), + "y": np.array([3, 6, 5]).astype('int32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='int32') + y = fluid.data(name="y", shape=[3], dtype='int32') + z = fluid.layers.elementwise_mod(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[1, 3, 3] + print(z_value) #[1, 3, 3] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_mod') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_mod' + ) return _elementwise_op(LayerHelper('elementwise_mod', **locals())) @@ -12707,78 +13400,89 @@ Examples: def elementwise_floordiv(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([10, 15, 8]).astype('int32'), - "y": np.array([3, 7, 5]).astype('int32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='int32') - y = fluid.data(name="y", shape=[3], dtype='int32') - z = fluid.layers.elementwise_floordiv(x, y) + def gen_data(): + return { + "x": np.array([10, 15, 8]).astype('int32'), + "y": np.array([3, 7, 5]).astype('int32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='int32') + y = fluid.data(name="y", shape=[3], dtype='int32') + z = fluid.layers.elementwise_floordiv(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[3, 2, 1] + print(z_value) #[3, 2, 1] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_floordiv') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_floordiv' + ) return _elementwise_op(LayerHelper('elementwise_floordiv', **locals())) for func in [ - elementwise_add, - elementwise_div, - elementwise_sub, - elementwise_mul, - elementwise_max, - elementwise_pow, - elementwise_min, - elementwise_mod, - elementwise_floordiv, + elementwise_add, + elementwise_div, + elementwise_sub, + elementwise_mul, + elementwise_max, + elementwise_pow, + elementwise_min, + elementwise_mod, + elementwise_floordiv, ]: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) # insert the c++ doc string on top of python doc string - func.__doc__ = _generate_doc_string_( - op_proto, - additional_args_lines=[ - "axis (int32, optional): If X.dimension != Y.dimension, \ + func.__doc__ = ( + _generate_doc_string_( + op_proto, + additional_args_lines=[ + "axis (int32, optional): If X.dimension != Y.dimension, \ Y.dimension must be a subsequence of x.dimension. \ And axis is the start dimension index for broadcasting Y onto X. ", - "act (string, optional): Activation applied to the output. \ + "act (string, optional): Activation applied to the output. \ Default is None. Details: :ref:`api_guide_activations_en` ", - "name (string, optional): Name of the output. \ + "name (string, optional): Name of the output. \ Default is None. It's used to print debug info for developers. Details: \ - :ref:`api_guide_Name` " - ], - skip_attrs_set={ - "x_data_format", "y_data_format", "axis", "use_quantizer", - "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out" - }) + """\n""" + str(func.__doc__) + :ref:`api_guide_Name` ", + ], + skip_attrs_set={ + "x_data_format", + "y_data_format", + "axis", + "use_quantizer", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out", + }, + ) + + """\n""" + + str(func.__doc__) + ) doc_list = func.__doc__.splitlines() for idx, val in enumerate(doc_list): - if val.startswith("Warning: ") and val.endswith( - " instead." - ) and "and will be removed in future versions." in val: + if ( + val.startswith("Warning: ") + and val.endswith(" instead.") + and "and will be removed in future versions." 
in val + ): doc_list.insert(0, doc_list.pop(idx)) func.__doc__ = "\n" + "\n".join(i for i in doc_list) break @@ -12789,9 +13493,12 @@ for func in []: op_proto, additional_args_lines=[ "act (basestring|None): Activation applied to the output.", - "name (basestring|None): Name of the output." - ]) - func.__doc__ = func.__doc__ + """ + "name (basestring|None): Name of the output.", + ], + ) + func.__doc__ = ( + func.__doc__ + + """ Examples: .. code-block:: python @@ -12826,8 +13533,16 @@ Examples: x5 = fluid.layers.data(name="x5", shape=[2, 3, 4, 5], dtype='float32') y5 = fluid.layers.data(name="y5", shape=[2], dtype='float32') z5 = fluid.layers.%s(x5, y5, axis=0) - """ % (func.__name__, func.__name__, func.__name__, func.__name__, - func.__name__, func.__name__) + """ + % ( + func.__name__, + func.__name__, + func.__name__, + func.__name__, + func.__name__, + func.__name__, + ) + ) def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): @@ -12838,14 +13553,18 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): else: return op(x) check_variable_and_dtype( - x, "x", + x, + "x", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], - op_name) + op_name, + ) if y is not None: check_variable_and_dtype( - y, "y", + y, + "y", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], - op_name) + op_name, + ) if out is not None: check_type(out, "out", Variable, op_name) @@ -12854,18 +13573,16 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): if binary_op and x.dtype != y.dtype: raise ValueError( "(InvalidArgument) The DataType of %s Op's Variable must be consistent, but received %s and %s." - % (op_name, x.dtype, y.dtype)) + % (op_name, x.dtype, y.dtype) + ) if out is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) if binary_op: - helper.append_op(type=op_name, - inputs={ - "X": x, - "Y": y - }, - outputs={"Out": out}) + helper.append_op( + type=op_name, inputs={"X": x, "Y": y}, outputs={"Out": out} + ) else: helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) @@ -12907,12 +13624,9 @@ def logical_and(x, y, out=None, name=None): if in_dygraph_mode(): return _C_ops.logical_and(x, y) - return _logical_op(op_name="logical_and", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True + ) def logical_or(x, y, out=None, name=None): @@ -12927,7 +13641,7 @@ def logical_or(x, y, out=None, name=None): .. note:: ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. - + Args: x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. 
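The hunks above only re-wrap the `_logical_op` helper and the `logical_and`/`logical_or` wrappers; the behaviour they document, including the broadcasting mentioned in the note just above, is not changed by this patch. As a minimal usage sketch of that broadcasting behaviour (illustrative only — it uses the public `paddle.logical_and`/`paddle.logical_or` APIs with made-up inputs and is not part of this diff):

.. code-block:: python

    # Illustrative sketch -- not part of this patch.
    import paddle

    x = paddle.to_tensor([True, False, True, False])
    y = paddle.to_tensor([True])  # shape [1], broadcast against x (shape [4])

    print(paddle.logical_and(x, y).numpy())  # [ True False  True False]
    print(paddle.logical_or(x, y).numpy())   # [ True  True  True  True]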
@@ -12952,12 +13666,9 @@ def logical_or(x, y, out=None, name=None): """ if in_dygraph_mode(): return _C_ops.logical_or(x, y) - return _logical_op(op_name="logical_or", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True + ) def logical_xor(x, y, out=None, name=None): @@ -12998,12 +13709,9 @@ def logical_xor(x, y, out=None, name=None): if in_dygraph_mode(): return _C_ops.logical_xor(x, y) - return _logical_op(op_name="logical_xor", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True + ) @templatedoc() @@ -13036,18 +13744,15 @@ def logical_not(x, out=None, name=None): """ if in_dygraph_mode(): return _C_ops.logical_not(x) - return _logical_op(op_name="logical_not", - x=x, - y=None, - name=name, - out=out, - binary_op=False) + return _logical_op( + op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False + ) @templatedoc() def clip(x, min, max, name=None): """ - :old_api: paddle.fluid.layers.clip + :old_api: paddle.fluid.layers.clip ${comment} @@ -13078,21 +13783,20 @@ def clip(x, min, max, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip') if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) - - out = helper.create_variable(type=x.type, - name=name, - dtype=x.dtype, - persistable=False) - - helper.append_op(type="clip", - inputs={"X": x}, - attrs={ - "min": min, - "max": max - }, - outputs={"Out": out}) + name = unique_name.generate_with_ignorable_key( + ".".join([helper.name, 'tmp']) + ) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) + + helper.append_op( + type="clip", + inputs={"X": x}, + attrs={"min": min, "max": max}, + outputs={"Out": out}, + ) return out @@ -13136,18 +13840,20 @@ def clip_by_norm(x, max_norm, name=None): check_type(max_norm, 'max_norm', (float), 'clip_by_norm') if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) + name = unique_name.generate_with_ignorable_key( + ".".join([helper.name, 'tmp']) + ) - out = helper.create_variable(type=x.type, - name=name, - dtype=x.dtype, - persistable=False) + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) - helper.append_op(type="clip_by_norm", - inputs={"X": x}, - attrs={"max_norm": max_norm}, - outputs={"Out": out}) + helper.append_op( + type="clip_by_norm", + inputs={"X": x}, + attrs={"max_norm": max_norm}, + outputs={"Out": out}, + ) return out @@ -13186,10 +13892,9 @@ def mean(x, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mean') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="mean", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}) + helper.append_op( + type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out} + ) return out @@ -13221,10 +13926,12 @@ def merge_selected_rows(x, name=None): helper = LayerHelper("merge_selected_rows", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="merge_selected_rows", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}, + ) return out @@ -13264,8 +13971,14 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): """ if 
_non_static_mode(): - return _legacy_C_ops.mul(x, y, 'x_num_col_dims', x_num_col_dims, - 'y_num_col_dims', y_num_col_dims) + return _legacy_C_ops.mul( + x, + y, + 'x_num_col_dims', + x_num_col_dims, + 'y_num_col_dims', + y_num_col_dims, + ) inputs = {"X": [x], "Y": [y]} attrs = {"x_num_col_dims": x_num_col_dims, "y_num_col_dims": y_num_col_dims} @@ -13274,13 +13987,9 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64'], 'mul') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="mul", - inputs={ - "X": x, - "Y": y - }, - attrs=attrs, - outputs={"Out": out}) + helper.append_op( + type="mul", inputs={"X": x, "Y": y}, attrs=attrs, outputs={"Out": out} + ) return out @@ -13410,24 +14119,27 @@ def space_to_depth(x, blocksize, name=None): if not (isinstance(blocksize, int)): raise ValueError("blocksize must be a python Int") - check_variable_and_dtype(x, 'x', \ - ['float16', 'float32', 'float64', 'int32', 'int64'], 'space_to_depth') + check_variable_and_dtype( + x, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'space_to_depth', + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="space_to_depth", - inputs={"X": x}, - attrs={"blocksize": blocksize}, - outputs={"Out": out}) + helper.append_op( + type="space_to_depth", + inputs={"X": x}, + attrs={"blocksize": blocksize}, + outputs={"Out": out}, + ) return out -def affine_channel(x, - scale=None, - bias=None, - data_layout='NCHW', - name=None, - act=None): +def affine_channel( + x, scale=None, bias=None, data_layout='NCHW', name=None, act=None +): """ Applies a separate affine transformation to each channel of the input. @@ -13494,14 +14206,12 @@ def affine_channel(x, check_type(bias, 'bias', (Variable, type(None)), 'affine_channel') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="affine_channel", - inputs={ - "X": x, - 'Scale': scale, - 'Bias': bias - }, - attrs={"data_layout": data_layout}, - outputs={"Out": out}) + helper.append_op( + type="affine_channel", + inputs={"X": x, 'Scale': scale, 'Bias': bias}, + attrs={"data_layout": data_layout}, + outputs={"Out": out}, + ) return helper.append_activation(out) @@ -13600,8 +14310,9 @@ def similarity_focus(input, axis, indexes, name=None): """ helper = LayerHelper('similarity_focus', **locals()) # check attrs - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - "similarity_focus") + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], "similarity_focus" + ) check_type(axis, 'axis', int, "similarity_focus") check_type(indexes, 'indexes', list, "similarity_focus") if axis != 1 and axis != 2 and axis != 3: @@ -13610,13 +14321,12 @@ def similarity_focus(input, axis, indexes, name=None): raise ValueError("indexes can not be empty.") out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='similarity_focus', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - "axis": axis, - "indexes": indexes - }) + helper.append_op( + type='similarity_focus', + inputs={'X': input}, + outputs={'Out': out}, + attrs={"axis": axis, "indexes": indexes}, + ) return out @@ -13671,15 +14381,15 @@ def hash(input, hash_size, num_hash=1, name=None): check_type(hash_size, 'hash_size', int, 'hash') check_type(num_hash, 'num_hash', int, 'hash') helper = LayerHelper('hash', **locals()) - out = helper.create_variable_for_type_inference(helper.input_dtype(), - 
stop_gradient=True) - helper.append_op(type='hash', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'num_hash': num_hash, - 'mod_by': hash_size - }) + out = helper.create_variable_for_type_inference( + helper.input_dtype(), stop_gradient=True + ) + helper.append_op( + type='hash', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'num_hash': num_hash, 'mod_by': hash_size}, + ) return out @@ -13773,8 +14483,9 @@ def grid_sampler(x, grid, name=None): helper = LayerHelper("grid_sampler", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler') - check_variable_and_dtype(grid, 'grid', ['float32', 'float64'], - 'grid_sampler') + check_variable_and_dtype( + grid, 'grid', ['float32', 'float64'], 'grid_sampler' + ) if not isinstance(x, Variable): return ValueError("The x should be a Variable") @@ -13786,10 +14497,9 @@ def grid_sampler(x, grid, name=None): attrs = {'use_cudnn': False} if core.is_compiled_with_rocm() else {} - helper.append_op(type='grid_sampler', - inputs=ipts, - outputs={'Output': out}, - attrs=attrs) + helper.append_op( + type='grid_sampler', inputs=ipts, outputs={'Output': out}, attrs=attrs + ) return out @@ -13882,33 +14592,30 @@ def add_position_encoding(input, alpha, beta, name=None): """ if _non_static_mode(): - return _legacy_C_ops.add_position_encoding(input, "alpha", alpha, - "beta", beta) + return _legacy_C_ops.add_position_encoding( + input, "alpha", alpha, "beta", beta + ) helper = LayerHelper('add_position_encoding', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - "add_position_encoding") + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], "add_position_encoding" + ) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type="add_position_encoding", - inputs={"X": input}, - outputs={"Out": out}, - attrs={ - "alpha": alpha, - "beta": beta - }) + helper.append_op( + type="add_position_encoding", + inputs={"X": input}, + outputs={"Out": out}, + attrs={"alpha": alpha, "beta": beta}, + ) return out -def bilinear_tensor_product(x, - y, - size, - act=None, - name=None, - param_attr=None, - bias_attr=None): +def bilinear_tensor_product( + x, y, size, act=None, name=None, param_attr=None, bias_attr=None +): r""" :api_attr: Static Graph @@ -13959,23 +14666,21 @@ def bilinear_tensor_product(x, param_shape = [size, x.shape[1], y.shape[1]] - w = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - is_bias=False) + w = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False + ) out = helper.create_variable_for_type_inference(dtype=dtype) inputs = {"X": x, "Y": y, "Weight": w} if helper.bias_attr: bias_size = [1, size] - bias = helper.create_parameter(attr=helper.bias_attr, - shape=bias_size, - dtype=dtype, - is_bias=True) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True + ) inputs["Bias"] = bias - helper.append_op(type="bilinear_tensor_product", - inputs=inputs, - outputs={"Out": out}) + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out} + ) # add activation return helper.append_activation(out) @@ -14025,10 +14730,12 @@ def get_tensor_from_selected_rows(x, name=None): ) helper = LayerHelper('get_tensor_from_selected_rows', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='get_tensor_from_selected_rows', - inputs={'X': x}, - 
outputs={'Out': out}, - attrs={}) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}, + ) return out @@ -14098,10 +14805,12 @@ def shuffle_channel(x, group, name=None): if not isinstance(group, int): raise TypeError("group must be int type") - helper.append_op(type="shuffle_channel", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"group": group}) + helper.append_op( + type="shuffle_channel", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"group": group}, + ) return out @@ -14139,8 +14848,9 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): input = paddle.randn([6, 4, 2, 2]) out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ - return paddle.nn.functional.temporal_shift(x, seg_num, shift_ratio, name, - data_format) + return paddle.nn.functional.temporal_shift( + x, seg_num, shift_ratio, name, data_format + ) class PyFuncRegistry(object): @@ -14199,7 +14909,7 @@ class PyFuncRegistry(object): func_ret = self._func(*args[idx:], **kwargs) if not isinstance(func_ret, (list, tuple)): - func_ret = (func_ret, ) + func_ret = (func_ret,) ret = [] for each_ret in func_ret: @@ -14415,11 +15125,13 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): out_list = out else: raise TypeError( - 'Output must be Variable/list(Variable)/tuple(Variable)') + 'Output must be Variable/list(Variable)/tuple(Variable)' + ) fwd_func_id = PyFuncRegistry(func).id - bwd_func_id = PyFuncRegistry( - backward_func).id if backward_func is not None else -1 + bwd_func_id = ( + PyFuncRegistry(backward_func).id if backward_func is not None else -1 + ) for each_out in out_list: if len(each_out.shape) == 0: @@ -14439,18 +15151,22 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): for v in skip_vars_in_backward_input: if not v.name in fwd_in_out: raise ValueError( - 'Variable {} is not found in forward inputs and outputs'. 
- format(v.name)) + 'Variable {} is not found in forward inputs and outputs'.format( + v.name + ) + ) backward_skip_vars.add(v.name) - helper.append_op(type='py_func', - inputs={'X': x}, - outputs={'Out': out_list}, - attrs={ - 'forward_callable_id': fwd_func_id, - 'backward_callable_id': bwd_func_id, - 'backward_skip_vars': list(backward_skip_vars) - }) + helper.append_op( + type='py_func', + inputs={'X': x}, + outputs={'Out': out_list}, + attrs={ + 'forward_callable_id': fwd_func_id, + 'backward_callable_id': bwd_func_id, + 'backward_skip_vars': list(backward_skip_vars), + }, + ) return out @@ -14460,13 +15176,15 @@ py_func.registered_func_num = PyFuncRegistry.registered_func_num @templatedoc() -def psroi_pool(input, - rois, - output_channels, - spatial_scale, - pooled_height, - pooled_width, - name=None): +def psroi_pool( + input, + rois, + output_channels, + spatial_scale, + pooled_height, + pooled_width, + name=None, +): """ ${comment} @@ -14514,29 +15232,30 @@ def psroi_pool(input, raise TypeError("pooled_width must be int type") dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='psroi_pool', - inputs={ - 'X': input, - 'ROIs': rois - }, - outputs={'Out': out}, - attrs={ - 'output_channels': output_channels, - 'spatial_scale': spatial_scale, - 'pooled_height': pooled_height, - 'pooled_width': pooled_width - }) + helper.append_op( + type='psroi_pool', + inputs={'X': input, 'ROIs': rois}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width, + }, + ) return out @templatedoc() -def prroi_pool(input, - rois, - spatial_scale=1.0, - pooled_height=1, - pooled_width=1, - batch_roi_nums=None, - name=None): +def prroi_pool( + input, + rois, + spatial_scale=1.0, + pooled_height=1, + pooled_width=1, + batch_roi_nums=None, + name=None, +): """ The precise roi pooling implementation for paddle. Reference: https://arxiv.org/pdf/1807.11590.pdf @@ -14599,14 +15318,16 @@ def prroi_pool(input, inputs_op = {'X': input, 'ROIs': rois} if batch_roi_nums is not None: inputs_op['BatchRoINums'] = batch_roi_nums - helper.append_op(type='prroi_pool', - inputs=inputs_op, - outputs={'Out': out}, - attrs={ - 'spatial_scale': spatial_scale, - 'pooled_height': pooled_height, - 'pooled_width': pooled_width - }) + helper.append_op( + type='prroi_pool', + inputs=inputs_op, + outputs={'Out': out}, + attrs={ + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width, + }, + ) return out @@ -14635,23 +15356,23 @@ def pixel_shuffle(x, upscale_factor): Examples: .. 
code-block:: python - # declarative mode - import paddle.fluid as fluid - import numpy as np - input = fluid.data(name="input", shape=[2,9,4,4]) - output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + # declarative mode + import paddle.fluid as fluid + import numpy as np + input = fluid.data(name="input", shape=[2,9,4,4]) + output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,9,4,4).astype("float32") - output_data = exe.run(fluid.default_main_program(), + input_data = np.random.rand(2,9,4,4).astype("float32") + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - # print(output.shape) - # (2L, 1L, 12L, 12L) + # print(output.shape) + # (2L, 1L, 12L, 12L) """ @@ -14663,10 +15384,12 @@ def pixel_shuffle(x, upscale_factor): if not isinstance(upscale_factor, int): raise TypeError("upscale factor must be int type") - helper.append_op(type="pixel_shuffle", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"upscale_factor": upscale_factor}) + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor}, + ) return out @@ -14716,8 +15439,9 @@ def fsp_matrix(x, y): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fsp_matrix') check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'fsp_matrix') helper = LayerHelper('fsp_matrix', **locals()) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype( - input_param_name='x')) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype(input_param_name='x') + ) helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out}) return out @@ -14767,15 +15491,15 @@ def continuous_value_model(input, cvm, use_cvm=True): """ helper = LayerHelper('cvm', **locals()) out = helper.create_variable(dtype=input.dtype) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'cvm') - helper.append_op(type='cvm', - inputs={ - 'X': [input], - 'CVM': [cvm] - }, - outputs={'Y': [out]}, - attrs={"use_cvm": use_cvm}) + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'cvm' + ) + helper.append_op( + type='cvm', + inputs={'X': [input], 'CVM': [cvm]}, + outputs={'Y': [out]}, + attrs={"use_cvm": use_cvm}, + ) return out @@ -14821,11 +15545,14 @@ def where(condition): helper = LayerHelper("where_index", **locals()) out = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.INT64) - - helper.append_op(type='where_index', - inputs={'Condition': condition}, - outputs={'Out': [out]}) + dtype=core.VarDesc.VarType.INT64 + ) + + helper.append_op( + type='where_index', + inputs={'Condition': condition}, + outputs={'Out': [out]}, + ) return out @@ -14884,21 +15611,21 @@ def unique(x, dtype='int32'): out, index = fluid.layers.unique(x) # out is [2, 3, 1, 5]; index is [0, 1, 1, 2, 3, 1] """ - check_variable_and_dtype(x, "x", ['float32', 'float64', 'int32', 'int64'], - "unique") + check_variable_and_dtype( + x, "x", ['float32', 'float64', 'int32', 'int64'], "unique" + ) helper = LayerHelper("unique", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) index = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='unique', - inputs={'X': x}, 
- attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, - outputs={ - 'Out': [out], - 'Index': [index] - }) + helper.append_op( + type='unique', + inputs={'X': x}, + attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, + outputs={'Out': [out], 'Index': [index]}, + ) return out, index @@ -14931,11 +15658,13 @@ def unique_with_counts(x, dtype='int32'): # count is [1, 3, 1, 1] # x.shape=(6,) out.shape=(4,), index.shape=(6,), count.shape=(4,) """ - check_variable_and_dtype(x, "x", ['float32', 'float64', 'int32', 'int64'], - "unique_with_counts") + check_variable_and_dtype( + x, "x", ['float32', 'float64', 'int32', 'int64'], "unique_with_counts" + ) if not (dtype == 'int32' or dtype == 'int64'): raise TypeError( - "Op unique_with_counts, index dtype must be int32 or int64") + "Op unique_with_counts, index dtype must be int32 or int64" + ) if x is None or len(x.shape) != 1: raise ValueError( @@ -14950,33 +15679,33 @@ def unique_with_counts(x, dtype='int32'): count = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='unique_with_counts', - inputs={'X': x}, - attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, - outputs={ - 'Out': [out], - 'Index': [index], - 'Count': [count] - }) + helper.append_op( + type='unique_with_counts', + inputs={'X': x}, + attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, + outputs={'Out': [out], 'Index': [index], 'Count': [count]}, + ) return out, index, count -def deformable_conv(input, - offset, - mask, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - deformable_groups=None, - im2col_step=None, - param_attr=None, - bias_attr=None, - modulated=True, - name=None): +def deformable_conv( + input, + offset, + mask, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + deformable_groups=None, + im2col_step=None, + param_attr=None, + bias_attr=None, + modulated=True, + name=None, +): r""" :api_attr: Static Graph @@ -15107,10 +15836,12 @@ def deformable_conv(input, num_filters=2, filter_size=filter_size, padding=1, modulated=False) """ - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'deformable_conv') - check_variable_and_dtype(offset, "offset", ['float32', 'float64'], - 'deformable_conv') + check_variable_and_dtype( + input, "input", ['float32', 'float64'], 'deformable_conv' + ) + check_variable_and_dtype( + offset, "offset", ['float32', 'float64'], 'deformable_conv' + ) check_type(mask, 'mask', (Variable, type(None)), 'deformable_conv') num_channels = input.shape[1] @@ -15145,52 +15876,58 @@ def deformable_conv(input, raise ValueError( "Invalid filter number, excepted number is larger than 0, but" " received {}, please check the input shape and " - "filter size.".format(filter_elem_num)) - std = (2.0 / filter_elem_num)**0.5 + "filter size.".format(filter_elem_num) + ) + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - default_initializer=_get_default_param_initializer()) + default_initializer=_get_default_param_initializer(), + ) pre_bias = helper.create_variable_for_type_inference(dtype) if modulated: - helper.append_op(type='deformable_conv', - inputs={ - 'Input': input, - 'Filter': filter_param, - 'Offset': offset, - 'Mask': mask, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'deformable_groups': deformable_groups, - 'im2col_step': im2col_step, - }) + 
helper.append_op( + type='deformable_conv', + inputs={ + 'Input': input, + 'Filter': filter_param, + 'Offset': offset, + 'Mask': mask, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'deformable_groups': deformable_groups, + 'im2col_step': im2col_step, + }, + ) else: - helper.append_op(type='deformable_conv_v1', - inputs={ - 'Input': input, - 'Filter': filter_param, - 'Offset': offset, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'deformable_groups': deformable_groups, - 'im2col_step': im2col_step, - }) + helper.append_op( + type='deformable_conv_v1', + inputs={ + 'Input': input, + 'Filter': filter_param, + 'Offset': offset, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'deformable_groups': deformable_groups, + 'im2col_step': im2col_step, + }, + ) output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) return output @@ -15266,23 +16003,26 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): y = F.unfold(x, [3, 3], 1, 1, 1) """ - return paddle.nn.functional.unfold(x, kernel_sizes, strides, paddings, - dilations, name) - - -def deformable_roi_pooling(input, - rois, - trans, - no_trans=False, - spatial_scale=1.0, - group_size=[1, 1], - pooled_height=1, - pooled_width=1, - part_size=None, - sample_per_part=1, - trans_std=0.1, - position_sensitive=False, - name=None): + return paddle.nn.functional.unfold( + x, kernel_sizes, strides, paddings, dilations, name + ) + + +def deformable_roi_pooling( + input, + rois, + trans, + no_trans=False, + spatial_scale=1.0, + group_size=[1, 1], + pooled_height=1, + pooled_width=1, + part_size=None, + sample_per_part=1, + trans_std=0.1, + position_sensitive=False, + name=None, +): r""" Deformable ROI Pooling Layer @@ -15387,17 +16127,22 @@ def deformable_roi_pooling(input, position_sensitive=False) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'deformable_roi_pooling') - check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], - 'deformable_roi_pooling') - check_variable_and_dtype(trans, 'trans', ['float32', 'float64'], - 'deformable_roi_pooling') - check_type(group_size, 'group_size', (list, tuple), - 'deformable_roi_pooling') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'deformable_roi_pooling' + ) + check_variable_and_dtype( + rois, 'rois', ['float32', 'float64'], 'deformable_roi_pooling' + ) + check_variable_and_dtype( + trans, 'trans', ['float32', 'float64'], 'deformable_roi_pooling' + ) + check_type( + group_size, 'group_size', (list, tuple), 'deformable_roi_pooling' + ) if part_size is not None: - check_type(part_size, 'part_size', (list, tuple), - 'deformable_roi_pooling') + check_type( + part_size, 'part_size', (list, tuple), 'deformable_roi_pooling' + ) input_channels = input.shape[1] if position_sensitive == False: @@ -15415,27 +16160,22 @@ def deformable_roi_pooling(input, dtype = helper.input_dtype() output = helper.create_variable_for_type_inference(dtype) top_count = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type="deformable_psroi_pooling", - inputs={ - "Input": input, - "ROIs": rois, - "Trans": trans - }, - outputs={ - "Output": output, - "TopCount": top_count - }, - attrs={ - "no_trans": no_trans, - "spatial_scale": spatial_scale, - "output_dim": 
output_channels, - "group_size": group_size, - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "part_size": part_size, - "sample_per_part": sample_per_part, - "trans_std": trans_std - }) + helper.append_op( + type="deformable_psroi_pooling", + inputs={"Input": input, "ROIs": rois, "Trans": trans}, + outputs={"Output": output, "TopCount": top_count}, + attrs={ + "no_trans": no_trans, + "spatial_scale": spatial_scale, + "output_dim": output_channels, + "group_size": group_size, + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "part_size": part_size, + "sample_per_part": sample_per_part, + "trans_std": trans_std, + }, + ) return output @@ -15458,7 +16198,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): For each value `v` in `input`, we reset it to a new value according to the following formula: :: - + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` @@ -15487,27 +16227,31 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): # [[-1], [1]] """ if in_dygraph_mode(): - return _C_ops.shard_index(input, index_num, nshards, shard_id, - ignore_value) + return _C_ops.shard_index( + input, index_num, nshards, shard_id, ignore_value + ) check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' helper = LayerHelper(op_type, **locals()) if shard_id < 0 or shard_id >= nshards: - raise ValueError('The shard_id(%d) should be in [0, %d)' % - (shard_id, nshards)) + raise ValueError( + 'The shard_id(%d) should be in [0, %d)' % (shard_id, nshards) + ) out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=op_type, - inputs={'X': [input]}, - outputs={'Out': out}, - attrs={ - 'index_num': index_num, - 'nshards': nshards, - 'shard_id': shard_id, - 'ignore_value': ignore_value - }, - stop_gradient=True) + helper.append_op( + type=op_type, + inputs={'X': [input]}, + outputs={'Out': out}, + attrs={ + 'index_num': index_num, + 'nshards': nshards, + 'shard_id': shard_id, + 'ignore_value': ignore_value, + }, + stop_gradient=True, + ) return out @@ -15562,22 +16306,22 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): print(out) # [[0.66666667, 1.66666667,3., 4.]] """ if _non_static_mode(): - return _legacy_C_ops.hard_swish(x, 'threshold', threshold, 'scale', - scale, 'offset', offset) + return _legacy_C_ops.hard_swish( + x, 'threshold', threshold, 'scale', scale, 'offset', offset + ) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hard_swish') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hard_swish' + ) helper = LayerHelper('hard_swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='hard_swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'threshold': threshold, - 'scale': scale, - 'offset': offset - }) + helper.append_op( + type='hard_swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold, 'scale': scale, 'offset': offset}, + ) return out @@ -15648,15 +16392,20 @@ def mish(x, threshold=20, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish') check_type(threshold, 'threshold', (float, int), 'mish') - assert threshold > 0, "threshold of mish should be greater than 0, " \ - "but got {}".format(threshold) + assert ( + 
threshold > 0 + ), "threshold of mish should be greater than 0, " "but got {}".format( + threshold + ) helper = LayerHelper('mish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='mish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='mish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out @@ -15726,12 +16475,9 @@ def gather_tree(ids, parents): @deprecated(since="2.0.0", update_to="paddle.uniform") @templatedoc() -def uniform_random(shape, - dtype='float32', - min=-1.0, - max=1.0, - seed=0, - name=None): +def uniform_random( + shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None +): """ This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. @@ -15811,34 +16557,47 @@ def uniform_random(shape, if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform_random(shape, dtype, float(min), float(max), seed, - _current_expected_place()) + return _C_ops.uniform_random( + shape, + dtype, + float(min), + float(max), + seed, + _current_expected_place(), + ) elif _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _legacy_C_ops.uniform_random('shape', - shape, 'min', float(min), 'max', - float(max), 'seed', seed, 'dtype', - dtype) + return _legacy_C_ops.uniform_random( + 'shape', + shape, + 'min', + float(min), + 'max', + float(max), + 'seed', + seed, + 'dtype', + dtype, + ) check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand') - check_dtype(dtype, 'dtype', ('float32', 'float64', 'uint16'), - 'uniform_random/rand') + check_dtype( + dtype, 'dtype', ('float32', 'float64', 'uint16'), 'uniform_random/rand' + ) check_type(min, 'min', (float, int, Variable), 'uniform_random/rand') check_type(max, 'max', (float, int, Variable), 'uniform_random/rand') inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=shape, - op_type='uniform_random/rand') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand' + ) helper = LayerHelper("uniform_random", **locals()) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="uniform_random", - inputs=inputs, - attrs=attrs, - outputs={"Out": out}) + helper.append_op( + type="uniform_random", inputs=inputs, attrs=attrs, outputs={"Out": out} + ) utils.try_set_static_shape_tensor(out, shape) return out @@ -15848,7 +16607,7 @@ def unbind(input, axis=0): Removes a tensor dimension, then split the input tensor into multiple sub-Tensors. Args: input (Variable): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64. - + axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0. 
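The hunk above covers the `unbind` layer, which removes the dimension `axis` and returns `input.shape[axis]` sub-tensors. A minimal usage sketch, written against the public `paddle.unbind` alias rather than the `fluid.layers` entry point being reformatted here (the import path is an assumption):

.. code-block:: python

    import paddle

    # A [3, 2] tensor unbound along axis 0 yields three tensors of shape [2].
    x = paddle.arange(6, dtype='float32').reshape([3, 2])
    a, b, c = paddle.unbind(x, axis=0)
    print(a.shape)  # [2]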
Returns: @@ -15874,11 +16633,13 @@ def unbind(input, axis=0): helper = LayerHelper("unbind", **locals()) check_type(input, 'input', (Variable), 'unbind') dtype = helper.input_dtype() - check_dtype(dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], - 'unbind') + check_dtype( + dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], 'unbind' + ) if not isinstance(axis, (int)): - raise TypeError("The type of 'axis' must be int, but received %s." % - (type(axis))) + raise TypeError( + "The type of 'axis' must be int, but received %s." % (type(axis)) + ) if isinstance(axis, np.generic): axis = np.asscalar(axis) input_shape = input.shape @@ -15889,8 +16650,10 @@ def unbind(input, axis=0): for i in range(num) ] - helper.append_op(type="unbind", - inputs={"X": input}, - outputs={"Out": outs}, - attrs={"axis": axis}) + helper.append_op( + type="unbind", + inputs={"X": input}, + outputs={"Out": outs}, + attrs={"axis": axis}, + ) return outs diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index b397a863702b7a94fc59c49df1832eb50b134b2c..839b6e93e80a27043af3b99a4c44fe544238d5e2 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -241,13 +241,13 @@ def send_ue_recv( src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. - reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. + message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. + reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. - out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. Otherwise, `out_size` should be equal with or larger than - max(dst_index) + 1. + max(dst_index) + 1. Default value is `None`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/geometric/reindex.py b/python/paddle/geometric/reindex.py index 94e9dbec4a5b2296cd034a2467b2f7fff6a2b64c..3b68931dfb99e7a08a9586844ab65d5eb78f51f4 100644 --- a/python/paddle/geometric/reindex.py +++ b/python/paddle/geometric/reindex.py @@ -26,6 +26,7 @@ def reindex_graph( x, neighbors, count, value_buffer=None, index_buffer=None, name=None ): """ + Reindex Graph API. This API is mainly used in Graph Learning domain, which should be used @@ -49,12 +50,12 @@ def reindex_graph( should be the same with `x`. count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, - and should be filled with -1. Only useful for gpu version. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32, + and should be filled with -1. Only useful for gpu version. Default is None. 
+ index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. `value_buffer` and `index_buffer` should be both not None - if you want to speed up by using hashtable buffer. + if you want to speed up by using hashtable buffer. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -69,6 +70,7 @@ def reindex_graph( .. code-block:: python import paddle + x = [0, 1, 2] neighbors = [8, 9, 0, 4, 7, 6, 7] count = [2, 3, 2] @@ -138,6 +140,7 @@ def reindex_heter_graph( x, neighbors, count, value_buffer=None, index_buffer=None, name=None ): """ + Reindex HeterGraph API. This API is mainly used in Graph Learning domain, which should be used @@ -161,12 +164,12 @@ def reindex_heter_graph( The data type should be the same with `x`. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, - and should be filled with -1. Only useful for gpu version. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32, + and should be filled with -1. Only useful for gpu version. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. `value_buffer` and `index_buffer` should be both not None - if you want to speed up by using hashtable buffer. + if you want to speed up by using hashtable buffer. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -183,6 +186,7 @@ def reindex_heter_graph( .. code-block:: python import paddle + x = [0, 1, 2] neighbors_a = [8, 9, 0, 4, 7, 6, 7] count_a = [2, 3, 2] diff --git a/python/paddle/geometric/sampling/neighbors.py b/python/paddle/geometric/sampling/neighbors.py index 2dd5a9fb27c6715d7064e9fd80f5f2271fe5ff69..a52570576b04c64f5e67ed0f8a540ff316bde245 100644 --- a/python/paddle/geometric/sampling/neighbors.py +++ b/python/paddle/geometric/sampling/neighbors.py @@ -32,6 +32,7 @@ def sample_neighbors( name=None, ): """ + Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to @@ -52,16 +53,16 @@ def sample_neighbors( The data type should be the same with `row`. input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. - sample_size (int): The number of neighbors we need to sample. Default value is -1, + sample_size (int, optional): The number of neighbors we need to sample. Default value is -1, which means returning all the neighbors of the input nodes. - eids (Tensor): The eid information of the input graph. If return_eids is True, + eids (Tensor, optional): The eid information of the input graph. If return_eids is True, then `eids` should not be None. The data type should be the same with `row`. Default is None. - return_eids (bool): Whether to return eid information of sample edges. Default is False. - perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` + return_eids (bool, optional): Whether to return eid information of sample edges. Default is False. 
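The parameter descriptions above belong to `sample_neighbors`, which draws up to `sample_size` neighbors per input node from a graph stored in CSC form. A minimal call sketch, assuming the API is exposed as `paddle.geometric.sample_neighbors`; the `colptr` and query nodes are illustrative values derived from the edge list quoted in the example further down this hunk:

.. code-block:: python

    import paddle

    row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
    colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
    nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")

    # Sample at most 2 neighbors for each query node.
    neighbors, counts = paddle.geometric.sample_neighbors(
        row, colptr, nodes, sample_size=2
    )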
+ perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` is True, then `perm_buffer` should not be None. The data type should be the same with `row`. If not None, we will use fiser-yates sampling - to speed up. Only useful for gpu version. + to speed up. Only useful for gpu version. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -69,15 +70,16 @@ def sample_neighbors( - out_neighbors (Tensor), the sample neighbors of the input nodes. - out_count (Tensor), the number of sampling neighbors of each input node, and the shape - should be the same with `input_nodes`. + should be the same with `input_nodes`. - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the - sample edges. + sample edges. Examples: .. code-block:: python import paddle + # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 8dc5b347e2bacb55b7c7b33365eecc4bbbd45d05..cea4951d8ef68373d304a4acdd3dd8f712b0d2f1 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -69,8 +69,9 @@ def to_list(value): def to_numpy(var): - assert isinstance(var, (Variable, fluid.core.VarBase, - fluid.core.eager.Tensor)), "not a variable" + assert isinstance( + var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor) + ), "not a variable" if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): return var.numpy() t = global_scope().find_var(var.name).get_tensor() @@ -105,10 +106,9 @@ def extract_args(func): def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): - return collective._c_allgather(x, - nranks, - ring_id=ring_id, - use_calc_stream=use_calc_stream) + return collective._c_allgather( + x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream + ) def wait_server_ready(endpoints): @@ -119,7 +119,8 @@ def wait_server_ready(endpoints): for ep in endpoints: ip_port = ep.split(":") with contextlib.closing( - socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as sock: sock.settimeout(2) result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: @@ -131,8 +132,9 @@ def wait_server_ready(endpoints): break -def init_communicator(program, rank, nranks, wait_port, current_endpoint, - endpoints): +def init_communicator( + program, rank, nranks, wait_port, current_endpoint, endpoints +): if nranks < 2: return other_endpoints = endpoints[:] @@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, nccl_id_var = block.create_var( name=fluid.unique_name.generate('nccl_id'), persistable=True, - type=fluid.core.VarDesc.VarType.RAW) - - block.append_op(type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - - block.append_op(type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': 0, - }) + type=fluid.core.VarDesc.VarType.RAW, + ) + + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + }, + ) + + block.append_op( + type='c_comm_init', + 
inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': 0, + }, + ) elif core.is_compiled_with_npu(): hccl_id_var = block.create_var( name=fluid.unique_name.generate('hccl_id'), persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op(type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - block.append_op(type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': 0, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks - }) + type=core.VarDesc.VarType.RAW, + ) + block.append_op( + type='c_gen_hccl_id', + inputs={}, + outputs={'Out': hccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + }, + ) + block.append_op( + type='c_comm_init_hccl', + inputs={'X': hccl_id_var}, + outputs={}, + attrs={ + 'rank': rank, + 'ring_id': 0, + 'device_id': int(os.getenv("FLAGS_selected_npus")), + 'rank_ids': nranks, + }, + ) def prepare_distributed_context(place=None): if place is None: - place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ + place = ( + fluid.CUDAPlace(ParallelEnv().dev_id) + if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0) + ) place = _get_paddle_place(place) strategy = fluid.dygraph.parallel.ParallelStrategy() @@ -208,9 +223,14 @@ def prepare_distributed_context(place=None): def _init_context(): communicator_prog = fluid.Program() - init_communicator(communicator_prog, strategy.local_rank, - strategy.nranks, True, strategy.current_endpoint, - strategy.trainer_endpoints) + init_communicator( + communicator_prog, + strategy.local_rank, + strategy.nranks, + True, + strategy.current_endpoint, + strategy.trainer_endpoints, + ) exe = fluid.Executor(place) exe.run(communicator_prog) @@ -220,7 +240,7 @@ def prepare_distributed_context(place=None): fluid.enable_dygraph(place) else: - assert ("Only support CUDAPlace for now.") + assert "Only support CUDAPlace for now." _parallel_context_initialized = True return strategy @@ -246,7 +266,9 @@ def _update_input_info(inputs): class StaticGraphAdapter(object): """ + Model traning/inference with a static graph. + """ def __init__(self, model): @@ -269,7 +291,7 @@ class StaticGraphAdapter(object): 'eval_total': 0, 'test_total': 0, 'eval_batch': 0, - 'test_batch': 0 + 'test_batch': 0, } self._nranks = ParallelEnv().nranks @@ -289,10 +311,13 @@ class StaticGraphAdapter(object): self.model.mode = value def train_batch(self, inputs, labels=None, update=True): - assert self.model._optimizer, \ - "model not ready, please call `model.prepare()` first" + assert ( + self.model._optimizer + ), "model not ready, please call `model.prepare()` first" self.mode = 'train' - assert update is True, "Does not support `update == False` in static mode by now." + assert ( + update is True + ), "Does not support `update == False` in static mode by now." 
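The assertion above only forbids `update=False` for the static-graph adapter; in dynamic mode the flag lets callers accumulate gradients over several batches before applying them. A small sketch in that spirit, with a hypothetical toy network and random data (dynamic mode assumed):

.. code-block:: python

    import paddle
    from paddle.static import InputSpec

    # Hypothetical single-layer classifier, just to exercise the flag.
    net = paddle.nn.Linear(784, 10)
    model = paddle.Model(net,
                         InputSpec([None, 784], 'float32', 'x'),
                         InputSpec([None, 1], 'int64', 'label'))
    model.prepare(paddle.optimizer.SGD(learning_rate=1e-3,
                                       parameters=model.parameters()),
                  paddle.nn.CrossEntropyLoss())

    data = paddle.rand([4, 784])
    label = paddle.randint(0, 10, [4, 1])
    model.train_batch([data], [label], update=False)  # accumulate gradients
    model.train_batch([data], [label], update=True)   # apply the accumulated update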
return self._run(inputs, labels) def eval_batch(self, inputs, labels=None): @@ -307,7 +332,6 @@ class StaticGraphAdapter(object): return self.model.network.parameters(*args, **kwargs) def save(self, path): - def _save(state, path): if not state: return @@ -331,8 +355,7 @@ class StaticGraphAdapter(object): # XXX `optimizer.state_dict()` only work in dygraph mode optim_path = path + ".pdopt" optim = { - p.name: p - for p in filter(is_belong_to_optimizer, prog.list_vars()) + p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars()) } if not optim: return @@ -348,8 +371,10 @@ class StaticGraphAdapter(object): # restore parameter states fluid.core._create_loaded_parameter( - [param for param, state in param_state_pairs], global_scope(), - executor) + [param for param, state in param_state_pairs], + global_scope(), + executor, + ) for param, state in param_state_pairs: self._set_var(param, state) @@ -377,9 +402,10 @@ class StaticGraphAdapter(object): # static-graph, since the time of global_step to increase is # different. state_val = ( - np.array(converted_state.pop("global_step")) - 1 - ) if "global_step" in converted_state else converted_state.pop( - "@LR_DECAY_COUNTER@", None) + (np.array(converted_state.pop("global_step")) - 1) + if "global_step" in converted_state + else converted_state.pop("@LR_DECAY_COUNTER@", None) + ) if state_val is not None: converted_state[var.name] = state_val elif var.name.startswith("learning_rate_"): @@ -396,36 +422,61 @@ class StaticGraphAdapter(object): opt_cls_name = self.model._optimizer.__class__.__name__ opt_unq_name = None for name in self.model._optimizer._accumulators.keys(): - accum_name = name if opt_name is None else name[ - len(opt_name) + 1:] - for param_name, state_var in self.model._optimizer._accumulators[ - name].items(): + accum_name = ( + name + if opt_name is None + else name[len(opt_name) + 1 :] + ) + for ( + param_name, + state_var, + ) in self.model._optimizer._accumulators[name].items(): if opt_unq_name is None: # can not infer out the exact unique(opt_name), # thus try to extract rather than generate - for state_key in sorted(state.keys(), - key=lambda x: len(x), - reverse=True): - prefix = param_name + "_" + ( - opt_cls_name - if opt_name is None else opt_name) + "_" + for state_key in sorted( + state.keys(), + key=lambda x: len(x), + reverse=True, + ): + prefix = ( + param_name + + "_" + + ( + opt_cls_name + if opt_name is None + else opt_name + ) + + "_" + ) if state_key.startswith(prefix): - prefix_offset = state_key[len( - prefix):].find("_") + len(prefix) + prefix_offset = state_key[ + len(prefix) : + ].find("_") + len(prefix) opt_unq_name = state_key[ - len(param_name + "_"):prefix_offset] + len( + param_name + "_" + ) : prefix_offset + ] # TODO: assert # assert opt_unq_name is None # gen(param.name + "_" + gen(opt_name) + "_" + accum_name) # always end with "_0" since the unique optimizer._name - dy_state_name = (param_name + "_" + opt_unq_name + - "_" + accum_name + "_0") + dy_state_name = ( + param_name + + "_" + + opt_unq_name + + "_" + + accum_name + + "_0" + ) converted_state[ - state_var.name] = converted_state.pop( - dy_state_name) + state_var.name + ] = converted_state.pop(dy_state_name) - assert var.name in converted_state, \ - "variable [{}] is not in optimizer state file".format(var.name) + assert ( + var.name in converted_state + ), "variable [{}] is not in optimizer state file".format(var.name) self._set_var(var, converted_state[var.name]) def _set_var(self, var, ndarray): @@ -444,15 +495,17 @@ class 
StaticGraphAdapter(object): def _run(self, inputs, labels=None): compiled_prog = self._compiled_progs.get(self.mode, None) - assert compiled_prog, \ - "Model is not ready, please call `model.prepare()` first" + assert ( + compiled_prog + ), "Model is not ready, please call `model.prepare()` first" inputs = to_list(inputs) if labels is not None: labels = to_list(labels) - assert len(inputs) == len(self._input_vars[self.mode]), \ - "number of inputs" \ + assert len(inputs) == len(self._input_vars[self.mode]), ( + "number of inputs" + " does not match number of arguments of `forward` method" + ) feed = {} input_names = [v.name for v in self._input_vars[self.mode]] @@ -462,8 +515,10 @@ class StaticGraphAdapter(object): # train and test may take different arguments if inputs[idx] is not None: feed[n] = inputs[idx] - if self._amp_level == 'O2' and input_dtypes[ - idx] == core.VarDesc.VarType.FP16: + if ( + self._amp_level == 'O2' + and input_dtypes[idx] == core.VarDesc.VarType.FP16 + ): if isinstance(feed[n], core.LoDTensor): feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) elif isinstance(feed[n], np.array): @@ -491,10 +546,12 @@ class StaticGraphAdapter(object): else: pruned_fetch_list.append(fetch_var) - rets = self._executor.run(compiled_prog, - feed=feed, - fetch_list=pruned_fetch_list, - return_numpy=False) + rets = self._executor.run( + compiled_prog, + feed=feed, + fetch_list=pruned_fetch_list, + return_numpy=False, + ) # restore pruned fetch_list Variable from feeds for i, name in enumerate(pruned_fetch_idx_name_map): @@ -510,20 +567,24 @@ class StaticGraphAdapter(object): metrics = [] for metric, state in zip(self.model._metrics, metric_states): # cut off padding size - if self.mode != 'train' and self.model._test_dataloader is not None \ - and isinstance(self.model._test_dataloader, DataLoader) \ - and self._nranks > 1: + if ( + self.mode != 'train' + and self.model._test_dataloader is not None + and isinstance(self.model._test_dataloader, DataLoader) + and self._nranks > 1 + ): total_size = len(self.model._test_dataloader.dataset) # TODO: fixme if have better way to get batch size samples = state[0].shape[0] current_count = self._merge_count.get(self.mode + '_total', 0) if current_count + samples >= total_size: state = [ - s[:int(total_size - current_count), ...] for s in state + s[: int(total_size - current_count), ...] 
for s in state ] self._merge_count[self.mode + '_total'] = 0 - self._merge_count[self.mode + '_batch'] = int(total_size - - current_count) + self._merge_count[self.mode + '_batch'] = int( + total_size - current_count + ) else: self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples @@ -555,8 +616,11 @@ class StaticGraphAdapter(object): if mode != 'train': for op in list(prog.global_block().ops): prog.global_block()._remove_op(0) - if mode == 'train' and self.model._optimizer \ - and self.model._optimizer._learning_rate_map: + if ( + mode == 'train' + and self.model._optimizer + and self.model._optimizer._learning_rate_map + ): # HACK workaround learning rate map issue lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] new_lr_var = prog.global_block().vars[lr_var.name] @@ -594,20 +658,27 @@ class StaticGraphAdapter(object): dist_strategy.amp = True dist_strategy.amp_configs = self._amp_configs.copy() dist_strategy.amp_configs.update(self._amp_custom_lists) - dist_strategy.amp_configs[ - 'use_pure_fp16'] = self._amp_level == 'O2' + dist_strategy.amp_configs['use_pure_fp16'] = ( + self._amp_level == 'O2' + ) self.model._optimizer = fleet.distributed_optimizer( - self.model._optimizer, strategy=dist_strategy) + self.model._optimizer, strategy=dist_strategy + ) elif self._amp_level != "O0" and core.is_compiled_with_cuda: - amp_lists = paddle.static.amp.AutoMixedPrecisionLists( - **self._amp_custom_lists - ) if self._amp_custom_lists else None + amp_lists = ( + paddle.static.amp.AutoMixedPrecisionLists( + **self._amp_custom_lists + ) + if self._amp_custom_lists + else None + ) self.model._optimizer = paddle.static.amp.decorate( self.model._optimizer, amp_lists=amp_lists, use_pure_fp16=self._amp_level == "O2", use_fp16_guard=self._use_fp16_guard, - **self._amp_configs) + **self._amp_configs + ) self.model._optimizer.minimize(self._loss_endpoint) @@ -620,7 +691,7 @@ class StaticGraphAdapter(object): self._endpoints[mode] = { "output": outputs, "loss": to_list(losses), - "metric": metrics + "metric": metrics, } def _compile_and_initialize(self, prog, mode): @@ -628,8 +699,9 @@ class StaticGraphAdapter(object): if compiled_prog is not None: return compiled_prog - assert self.model._place is not None, \ - "device is not set, please call `model.prepare()` first" + assert ( + self.model._place is not None + ), "device is not set, please call `model.prepare()` first" place = self.model._place @@ -642,8 +714,11 @@ class StaticGraphAdapter(object): uninitialized = [] for var_py in self._startup_prog.list_vars(): var = fluid.global_scope().find_var(var_py.name) - if not var_py.name.startswith('nccl_id') and var and \ - var.get_tensor()._is_initialized(): + if ( + not var_py.name.startswith('nccl_id') + and var + and var.get_tensor()._is_initialized() + ): continue uninitialized.append(var_py) @@ -651,7 +726,10 @@ class StaticGraphAdapter(object): startup_prog = self._startup_prog._prune(uninitialized) self._executor.run(startup_prog) - if self._amp_level == "O2" and mode == 'train' and core.is_compiled_with_cuda( + if ( + self._amp_level == "O2" + and mode == 'train' + and core.is_compiled_with_cuda() ): self.model._optimizer.amp_init(place) @@ -664,7 +742,6 @@ class StaticGraphAdapter(object): class DynamicGraphAdapter(object): - def __init__(self, model): super(DynamicGraphAdapter, self).__init__() self.model = model @@ -674,7 +751,7 @@ class DynamicGraphAdapter(object): 'eval_total': 0, 'test_total': 0, 'eval_batch': 0, - 'test_batch': 0 + 
'test_batch': 0, } self._input_info = None @@ -691,7 +768,8 @@ class DynamicGraphAdapter(object): stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints stradegy.current_endpoint = ParallelEnv().current_endpoint self.ddp_model = fluid.dygraph.parallel.DataParallel( - self.model.network, stradegy) + self.model.network, stradegy + ) @property def mode(self): @@ -703,8 +781,9 @@ class DynamicGraphAdapter(object): # TODO multi device in dygraph mode not implemented at present time def train_batch(self, inputs, labels=None, update=True): - assert self.model._optimizer, \ - "model not ready, please call `model.prepare()` first" + assert ( + self.model._optimizer + ), "model not ready, please call `model.prepare()` first" self.model.network.train() self.mode = 'train' inputs = to_list(inputs) @@ -716,9 +795,11 @@ class DynamicGraphAdapter(object): if self._amp_level != "O0" and self.model._scaler is None: self.model._scaler = paddle.amp.GradScaler(**self._amp_configs) - with paddle.amp.auto_cast(enable=self._amp_level != 'O0', - **self._amp_custom_lists, - level=self._amp_level): + with paddle.amp.auto_cast( + enable=self._amp_level != 'O0', + **self._amp_custom_lists, + level=self._amp_level + ): if self._nranks > 1: outputs = self.ddp_model(*[to_variable(x) for x in inputs]) else: @@ -746,8 +827,11 @@ class DynamicGraphAdapter(object): m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) - return ([to_numpy(l) for l in losses], metrics) \ - if len(metrics) > 0 else [to_numpy(l) for l in losses] + return ( + ([to_numpy(l) for l in losses], metrics) + if len(metrics) > 0 + else [to_numpy(l) for l in losses] + ) def eval_batch(self, inputs, labels=None): self.model.network.eval() @@ -777,21 +861,25 @@ class DynamicGraphAdapter(object): metrics = [] for metric in self.model._metrics: # cut off padding value. 
- if self.model._test_dataloader is not None and self._nranks > 1 \ - and isinstance(self.model._test_dataloader, DataLoader): + if ( + self.model._test_dataloader is not None + and self._nranks > 1 + and isinstance(self.model._test_dataloader, DataLoader) + ): total_size = len(self.model._test_dataloader.dataset) samples = outputs[0].shape[0] current_count = self._merge_count.get(self.mode + '_total', 0) if current_count + samples >= total_size: outputs = [ - o[:int(total_size - current_count)] for o in outputs + o[: int(total_size - current_count)] for o in outputs ] labels = [ - l[:int(total_size - current_count)] for l in labels + l[: int(total_size - current_count)] for l in labels ] self._merge_count[self.mode + '_total'] = 0 - self._merge_count[self.mode + '_batch'] = int(total_size - - current_count) + self._merge_count[self.mode + '_batch'] = int( + total_size - current_count + ) else: self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples @@ -858,38 +946,48 @@ class DynamicGraphAdapter(object): opt_unq_name = '' opt_cls_name = self.model._optimizer.__class__.__name__ - opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx + opt_name = opt_unq_name[: opt_unq_name.rfind("_")] # remove suffix idx param_names = [param.name for param in self.model.network.parameters()] - for var_name, state_var in sorted(optim_state.items(), - key=lambda x: len(x[0]), - reverse=True): + for var_name, state_var in sorted( + optim_state.items(), key=lambda x: len(x[0]), reverse=True + ): if var_name in ["@LR_DECAY_COUNTER@", "global_step"]: # NOTE: dygraph saved global_step is 1 larger than that in # static-graph, since the time of global_step to increase is # different. if var_name == "@LR_DECAY_COUNTER@": - converted_state["global_step"] = np.array( - converted_state.pop("@LR_DECAY_COUNTER@")) + 1 + converted_state["global_step"] = ( + np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1 + ) else: # moment and other accumulators # extend state dict to include promising dygraph names for param_name in param_names: if var_name.startswith(param_name + "_" + opt_name): # when init optimizer with name - accum_name = var_name[len(param_name + "_" + opt_name + - "_"):] - elif var_name.startswith(param_name + - "_") and opt_name == opt_cls_name: + accum_name = var_name[ + len(param_name + "_" + opt_name + "_") : + ] + elif ( + var_name.startswith(param_name + "_") + and opt_name == opt_cls_name + ): # when init optimizer without name - accum_name = var_name[len(param_name + "_"):] + accum_name = var_name[len(param_name + "_") :] else: continue # remove suffix idx - accum_name = accum_name[:accum_name.rfind("_")] + accum_name = accum_name[: accum_name.rfind("_")] # state names always end with "_0" in dygraph because of the # unique optimizer._name - dy_state_name = (param_name + "_" + opt_unq_name + "_" + - accum_name + "_0") + dy_state_name = ( + param_name + + "_" + + opt_unq_name + + "_" + + accum_name + + "_0" + ) converted_state[dy_state_name] = state_var if not hasattr(self.model._optimizer, 'set_state_dict'): @@ -901,18 +999,23 @@ class DynamicGraphAdapter(object): self.model._optimizer.set_state_dict(converted_state) def prepare(self): - if self._amp_level == "O2" and self.model.mode == 'train' and core.is_compiled_with_cuda( + if ( + self._amp_level == "O2" + and self.model.mode == 'train' + and core.is_compiled_with_cuda() ): self.model.network, self.model._optimizer = paddle.amp.decorate( models=self.model.network, 
optimizers=self.model._optimizer, - level='O2') + level='O2', + ) if self._amp_level != "O0": self.model._scaler = None class Model(object): """ + An Model object is network with training and inference features. Dynamic graph and static graph are supported at the same time, switched by `paddle.enable_static()`. The usage is as follows. @@ -920,7 +1023,7 @@ class Model(object): instantiating a Model. The input description, i.e, paddle.static.InputSpec, must be required for static graph. - When training on GPU, auto mixed precision (AMP O1) and pure float16 + When training on GPU, auto mixed precision (AMP O1) and pure float16 (AMP O2) training are both supported in static mode and dynamic mode. In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can @@ -965,7 +1068,7 @@ class Model(object): # inputs and labels are not required for dynamic graph. input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') - + model = paddle.Model(net, input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) @@ -1053,16 +1156,17 @@ class Model(object): def train_batch(self, inputs, labels=None, update=True): """ + Run one training step on one batch of data. And using `update` indicates whether optimizer update gradients computing by this batch. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be - a numpy array or paddle.Tensor, or a list of arrays or tensors - (in case the model has multiple labels). If has no labels, + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, set None. Default: None. update (bool, optional): Whether update parameters after loss.backward() computing. Set it to False to accumulate gradients. Default: True. @@ -1075,7 +1179,7 @@ class Model(object): Examples: .. code-block:: python - + import paddle import paddle.nn as nn from paddle.static import InputSpec @@ -1098,6 +1202,7 @@ class Model(object): loss = model.train_batch([data], [label]) print(loss) # [array([2.192784], dtype=float32)] + """ loss = self._adapter.train_batch(inputs, labels, update) if fluid._non_static_mode() and self._input_info is None: @@ -1107,15 +1212,16 @@ class Model(object): @no_grad() def eval_batch(self, inputs, labels=None): """ + Run one evaluating step on a batch of data. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be - a numpy array or paddle.Tensor, or a list of arrays or tensors - (in case the model has multiple labels). If has no labels, + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). 
If has no labels, set None. Default: None. Returns: @@ -1150,6 +1256,7 @@ class Model(object): loss, acc = model.eval_batch([data], [label]) print(loss, acc) # [array([2.8825705], dtype=float32)] [0.0] + """ loss = self._adapter.eval_batch(inputs, labels) if fluid._non_static_mode() and self._input_info is None: @@ -1159,11 +1266,12 @@ class Model(object): @no_grad() def predict_batch(self, inputs): """ + Run one predicting step on a batch of data. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). Returns: @@ -1179,7 +1287,7 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - + input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') @@ -1197,6 +1305,7 @@ class Model(object): # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # dtype=float32)] + """ loss = self._adapter.predict_batch(inputs) if fluid._non_static_mode() and self._input_info is None: @@ -1204,12 +1313,13 @@ class Model(object): return loss def save(self, path, training=True): - """ - This function saves parameters, optimizer information or model and + """ + + This function saves parameters, optimizer information or model and paramters only for inference to path. It depends on the parameter `training`. - If `training` is set to True, the parameters saved contain all + If `training` is set to True, the parameters saved contain all the trainable Variable, will save to a file with suffix ".pdparams". The optimizer information contains all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. All the @@ -1268,10 +1378,11 @@ class Model(object): T.Normalize([127.5], [127.5]) ]) data = paddle.vision.datasets.MNIST(mode='train', transform=transform) - + model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference + """ if ParallelEnv().local_rank == 0: @@ -1282,6 +1393,7 @@ class Model(object): def load(self, path, skip_mismatch=False, reset_optimizer=False): """ + Load from files storing the model states and optimizer states. The file for optimizer states is not necessary if no need to restore the optimizer. @@ -1329,6 +1441,7 @@ class Model(object): model.save('checkpoint/test') model.load('checkpoint/test') + """ def _load_state_from_path(path): @@ -1341,17 +1454,24 @@ class Model(object): state = param_state.get(key, None) if state is None: raise ValueError( - "{} is not found in the providing file.".format(key)) + "{} is not found in the providing file.".format(key) + ) if list(state.shape) != list(param.shape): raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". 
- format(key, list(state.shape), list(param.shape))) + "{} receives a shape {}, but the expected shape is {}.".format( + key, list(state.shape), list(param.shape) + ) + ) return param, state def _strip_postfix(path): path, ext = os.path.splitext(path) - assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ - "Unknown postfix {} from weights".format(ext) + assert ext in [ + '', + '.pdparams', + '.pdopt', + '.pdmodel', + ], "Unknown postfix {} from weights".format(ext) return path path = _strip_postfix(path) @@ -1365,15 +1485,17 @@ class Model(object): except ValueError as err: if skip_mismatch: warnings.warn( - ("Skip loading for {}. ".format(key) + str(err))) + ("Skip loading for {}. ".format(key) + str(err)) + ) # reset optimizer when mismatch happens reset_optimizer = True else: raise err matched_param_state.append(match_res) - optim_state = None if reset_optimizer else _load_state_from_path( - path + ".pdopt") + optim_state = ( + None if reset_optimizer else _load_state_from_path(path + ".pdopt") + ) # TODO: support save/load scaler state in static graph if _non_static_mode(): @@ -1382,13 +1504,15 @@ class Model(object): if os.path.exists(path + '.pdscaler'): scaler_state = paddle.load(path + '.pdscaler') - return self._adapter.load(matched_param_state, optim_state, - scaler_state) + return self._adapter.load( + matched_param_state, optim_state, scaler_state + ) else: return self._adapter.load(matched_param_state, optim_state) def parameters(self, *args, **kwargs): """ + Returns a list of parameters of the model. Returns: @@ -1398,30 +1522,32 @@ class Model(object): Examples: .. code-block:: python - + import paddle import paddle.nn as nn from paddle.static import InputSpec input = InputSpec([None, 784], 'float32', 'x') - + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10)), input) params = model.parameters() + """ return self._adapter.parameters() def _prepare_amp(self, amp_configs): - def _check_pure_fp16_configs(): # pure float16 training has some restricts now if self._adapter._amp_level == "O2" and self._optimizer._grad_clip: # clip by value is not supported - assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \ - "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." + assert isinstance( + self._optimizer._grad_clip, + (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm), + ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." self._adapter._amp_custom_lists = {} self._adapter._amp_configs = {} @@ -1433,7 +1559,8 @@ class Model(object): elif isinstance(amp_configs, str): if amp_configs not in ('O0', 'O1', 'O2'): raise ValueError( - "The level of amp_configs should be 'O0', 'O1' or 'O2'.") + "The level of amp_configs should be 'O0', 'O1' or 'O2'." + ) self._adapter._amp_level = amp_configs _check_pure_fp16_configs() return @@ -1442,7 +1569,8 @@ class Model(object): self._adapter._amp_level = 'O1' elif amp_configs['level'] not in ('O0', 'O1', 'O2'): raise ValueError( - "amp_configs['level'] should be 'O0', 'O1' or 'O2'.") + "amp_configs['level'] should be 'O0', 'O1' or 'O2'." 
+ ) else: self._adapter._amp_level = amp_configs['level'] amp_config_key_set = set(amp_configs.keys()) - {'level'} @@ -1459,12 +1587,14 @@ class Model(object): # construct amp_custom_lists if self._adapter._amp_level != 'O0' and amp_config_key_set: for param_name in [ - 'custom_white_list', 'custom_black_list', - 'custom_black_varnames' + 'custom_white_list', + 'custom_black_list', + 'custom_black_varnames', ]: if param_name in amp_config_key_set: self._adapter._amp_custom_lists[param_name] = amp_configs[ - param_name] + param_name + ] amp_config_key_set -= {param_name} def _check_amp_configs(amp_config_key_set): @@ -1479,13 +1609,16 @@ class Model(object): } if amp_config_key_set - accepted_param_set: raise ValueError( - "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized." - .format(tuple(amp_config_key_set - accepted_param_set))) + "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format( + tuple(amp_config_key_set - accepted_param_set) + ) + ) if 'use_fp16_guard' in amp_config_key_set: if _non_static_mode(): raise ValueError( - "'use_fp16_guard' is supported in static mode only.") + "'use_fp16_guard' is supported in static mode only." + ) self._adapter._use_fp16_guard = amp_configs['use_fp16_guard'] amp_config_key_set.remove('use_fp16_guard') @@ -1495,12 +1628,11 @@ class Model(object): for key in amp_configs_set: self._adapter._amp_configs[key] = amp_configs[key] - def prepare(self, - optimizer=None, - loss=None, - metrics=None, - amp_configs=None): + def prepare( + self, optimizer=None, loss=None, metrics=None, amp_configs=None + ): """ + Configures the model before runing. Args: @@ -1532,6 +1664,7 @@ class Model(object): Returns: None + """ self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): @@ -1539,15 +1672,17 @@ class Model(object): if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if fluid._non_static_mode(): main_prog_seed = fluid.default_main_program().random_seed - startup_prog_seed = fluid.default_startup_program( - ).random_seed + startup_prog_seed = ( + fluid.default_startup_program().random_seed + ) fluid.disable_dygraph() paddle.disable_static(self._place) # enable_dygraph would create and switch to a new program, # thus also copy seed to the new program fluid.default_main_program().random_seed = main_prog_seed - fluid.default_startup_program( - ).random_seed = startup_prog_seed + fluid.default_startup_program().random_seed = ( + startup_prog_seed + ) else: prepare_distributed_context(self._place) _parallel_context_initialized = True @@ -1562,43 +1697,46 @@ class Model(object): metrics = metrics or [] for metric in to_list(metrics): - assert isinstance(metric, Metric), \ - "{} is not sub class of Metric".format( - metric.__class__.__name__) + assert isinstance( + metric, Metric + ), "{} is not sub class of Metric".format(metric.__class__.__name__) self._metrics = to_list(metrics) self._prepare_amp(amp_configs) self._adapter.prepare() - def fit(self, - train_data=None, - eval_data=None, - batch_size=1, - epochs=1, - eval_freq=1, - log_freq=10, - save_dir=None, - save_freq=1, - verbose=2, - drop_last=False, - shuffle=True, - num_workers=0, - callbacks=None, - accumulate_grad_batches=1, - num_iters=None): + def fit( + self, + train_data=None, + eval_data=None, + batch_size=1, + epochs=1, + eval_freq=1, + log_freq=10, + save_dir=None, + save_freq=1, + verbose=2, + drop_last=False, + shuffle=True, + 
num_workers=0, + callbacks=None, + accumulate_grad_batches=1, + num_iters=None, + ): """ + Trains the model for a fixed number of epochs. If `eval_data` is set, evaluation will be done at the end of each epoch. Args: - train_data (Dataset|DataLoader, optional): An iterable data loader is used for - train. An instance of paddle paddle.io.Dataset or + train_data (Dataset|DataLoader, optional): An iterable data loader is used for + train. An instance of paddle paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. eval_data (Dataset|DataLoader, optional): An iterable data loader is used for - evaluation at the end of epoch. If None, will not do evaluation. - An instance of paddle.io.Dataset or paddle.io.Dataloader + evaluation at the end of epoch. If None, will not do evaluation. + An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. - batch_size (int, optional): The batch size of train_data and eval_data. When + batch_size (int, optional): The batch size of train_data and eval_data. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1. @@ -1626,7 +1764,7 @@ class Model(object): callbacks (Callback|None, optional): A list of `Callback` instances to apply during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. - accumulate_grad_batches (int, optional): The number of batches to accumulate gradident + accumulate_grad_batches (int, optional): The number of batches to accumulate gradident during training process before optimizer updates. It can mimic large batch size. Default: 1. num_iters (int|None, optional): The number of iterations to evaluate the model. @@ -1641,7 +1779,7 @@ class Model(object): How to make a batch is done internally. .. code-block:: python - :name: code-example1 + :name: code-example3 import paddle import paddle.vision.transforms as T @@ -1681,7 +1819,7 @@ class Model(object): DataLoader. .. code-block:: python - :name: code-example2 + :name: code-example4 import paddle import paddle.vision.transforms as T @@ -1691,7 +1829,7 @@ class Model(object): dynamic = True if not dynamic: paddle.enable_static() - + transform = T.Compose([ T.Transpose(), T.Normalize([127.5], [127.5]) @@ -1718,31 +1856,38 @@ class Model(object): val_loader, epochs=2, save_dir='mnist_checkpoint') + """ - assert train_data is not None, \ - "train_data must be given!" + assert train_data is not None, "train_data must be given!" 
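A minimal sketch of how the newer `accumulate_grad_batches` and `num_iters` arguments combine; a small synthetic dataset is assumed here (hypothetical, standing in for the MNIST dataset used in the examples above) so the snippet has no download dependency:

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn as nn
    from paddle.static import InputSpec

    # Hypothetical in-memory dataset standing in for MNIST.
    class RandomDataset(paddle.io.Dataset):
        def __init__(self, num_samples=64):
            self.num_samples = num_samples

        def __getitem__(self, idx):
            image = np.random.random([784]).astype('float32')
            label = np.random.randint(0, 10, size=(1,)).astype('int64')
            return image, label

        def __len__(self):
            return self.num_samples

    input = InputSpec([None, 784], 'float32', 'x')
    label = InputSpec([None, 1], 'int64', 'label')

    model = paddle.Model(nn.Sequential(
        nn.Linear(784, 200),
        nn.Tanh(),
        nn.Linear(200, 10)), input, label)
    model.prepare(
        paddle.optimizer.Adam(parameters=model.parameters()),
        paddle.nn.CrossEntropyLoss())

    # Apply gradients once every 4 mini-batches, mimicking a larger
    # effective batch size; num_iters caps the number of training steps.
    model.fit(RandomDataset(),
              batch_size=8,
              accumulate_grad_batches=4,
              num_iters=8,
              verbose=0)

With `batch_size=8` and `accumulate_grad_batches=4`, each optimizer update effectively sees 32 samples while per-step memory stays at the smaller batch size.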
if isinstance(train_data, Dataset): - train_sampler = DistributedBatchSampler(train_data, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last) - train_loader = DataLoader(train_data, - batch_sampler=train_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + train_sampler = DistributedBatchSampler( + train_data, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + ) + train_loader = DataLoader( + train_data, + batch_sampler=train_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) else: train_loader = train_data if eval_data is not None and isinstance(eval_data, Dataset): - eval_sampler = DistributedBatchSampler(eval_data, - batch_size=batch_size) - eval_loader = DataLoader(eval_data, - batch_sampler=eval_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + eval_sampler = DistributedBatchSampler( + eval_data, batch_size=batch_size + ) + eval_loader = DataLoader( + eval_data, + batch_sampler=eval_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) elif eval_data is not None: eval_loader = eval_data else: @@ -1755,8 +1900,11 @@ class Model(object): steps = self._len_data_loader(train_loader) self.num_iters = num_iters - if num_iters is not None and isinstance(num_iters, int) and isinstance( - steps, int): + if ( + num_iters is not None + and isinstance(num_iters, int) + and isinstance(steps, int) + ): assert num_iters > 0, "num_iters must be greater than 0!" epochs = (num_iters // steps) + 1 steps = min(num_iters, steps) @@ -1784,10 +1932,10 @@ class Model(object): if do_eval and epoch % eval_freq == 0: eval_steps = self._len_data_loader(eval_loader) - cbks.on_begin('eval', { - 'steps': eval_steps, - 'metrics': self._metrics_name() - }) + cbks.on_begin( + 'eval', + {'steps': eval_steps, 'metrics': self._metrics_name()}, + ) eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') @@ -1798,20 +1946,22 @@ class Model(object): cbks.on_end('train', logs) self._test_dataloader = None - def evaluate(self, - eval_data, - batch_size=1, - log_freq=10, - verbose=2, - num_workers=0, - callbacks=None, - num_iters=None): + def evaluate( + self, + eval_data, + batch_size=1, + log_freq=10, + verbose=2, + num_workers=0, + callbacks=None, + num_iters=None, + ): """ Evaluate the loss and metrics of the model on input dataset. Args: eval_data (Dataset|DataLoader): An iterable data loader is used for - evaluation. An instance of paddle.io.Dataset or + evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. batch_size (int, optional): The batch size of train_data and eval_data. 
When eval_data is the instance of Dataloader, this argument will be @@ -1859,13 +2009,16 @@ class Model(object): """ if eval_data is not None and isinstance(eval_data, Dataset): - eval_sampler = DistributedBatchSampler(eval_data, - batch_size=batch_size) - eval_loader = DataLoader(eval_data, - batch_sampler=eval_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + eval_sampler = DistributedBatchSampler( + eval_data, batch_size=batch_size + ) + eval_loader = DataLoader( + eval_data, + batch_sampler=eval_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) else: eval_loader = eval_data @@ -1881,15 +2034,17 @@ class Model(object): eval_steps = self._len_data_loader(eval_loader) self.num_iters = num_iters - if num_iters is not None and isinstance(num_iters, int) and isinstance( - eval_steps, int): + if ( + num_iters is not None + and isinstance(num_iters, int) + and isinstance(eval_steps, int) + ): assert num_iters > 0, "num_iters must be greater than 0!" eval_steps = min(num_iters, eval_steps) self.num_iters = eval_steps - cbks.on_begin('eval', { - 'steps': eval_steps, - 'metrics': self._metrics_name() - }) + cbks.on_begin( + 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()} + ) logs = self._run_one_epoch(eval_loader, cbks, 'eval') @@ -1903,13 +2058,15 @@ class Model(object): return eval_result - def predict(self, - test_data, - batch_size=1, - num_workers=0, - stack_outputs=False, - verbose=1, - callbacks=None): + def predict( + self, + test_data, + batch_size=1, + num_workers=0, + stack_outputs=False, + verbose=1, + callbacks=None, + ): """ Compute the output predictions on testing data. @@ -1919,7 +2076,7 @@ class Model(object): is recomended. batch_size (int, optional): The batch size of test_data. When test_data is the instance of Dataloader, this argument will be ignored. Default: 1. - num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess + num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess used and loading data in main process. When test_data is the instance of Dataloader, this argument will be ignored. Default: 0. stack_outputs (bool, optional): Whether stack output field like a batch, as for an output @@ -1980,13 +2137,16 @@ class Model(object): """ if test_data is not None and isinstance(test_data, Dataset): - test_sampler = DistributedBatchSampler(test_data, - batch_size=batch_size) - test_loader = DataLoader(test_data, - batch_sampler=test_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + test_sampler = DistributedBatchSampler( + test_data, batch_size=batch_size + ) + test_loader = DataLoader( + test_data, + batch_sampler=test_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) else: test_loader = test_data @@ -2036,7 +2196,8 @@ class Model(object): if self._is_shape_inferred: warnings.warn( "'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization." 
- % self._input_info[0]) + % self._input_info[0] + ) paddle.jit.save(layer, path, input_spec=self._inputs) @@ -2047,7 +2208,8 @@ class Model(object): raise ValueError( "The input path MUST be format of dirname/file_prefix " "[dirname\\file_prefix in Windows system], but received " - "file_prefix is empty string.") + "file_prefix is empty string." + ) dirname = os.path.dirname(path) if dirname and not os.path.exists(dirname): @@ -2058,21 +2220,24 @@ class Model(object): params_filename = file_prefix + INFER_PARAMS_SUFFIX prog = self._adapter._progs.get('test', None) - assert prog, \ - "Model is not ready, please call `model.prepare()` first" + assert ( + prog + ), "Model is not ready, please call `model.prepare()` first" infer_prog = prog.clone(for_test=True) input_names = [v.name for v in self._adapter._input_vars['test']] endpoints = self._adapter._endpoints['test']['output'] - fluid.io.save_inference_model(model_path, - input_names, - endpoints, - self._adapter._executor, - main_program=infer_prog, - model_filename=model_filename, - params_filename=params_filename) + fluid.io.save_inference_model( + model_path, + input_names, + endpoints, + self._adapter._executor, + main_program=infer_prog, + model_filename=model_filename, + params_filename=params_filename, + ) def _run_one_epoch( self, @@ -2098,16 +2263,21 @@ class Model(object): # LoDTensor.shape is callable, where LoDTensor comes from # DataLoader in static graph - batch_size = data[0].shape()[0] if callable( - data[0].shape) else data[0].shape[0] + batch_size = ( + data[0].shape()[0] + if callable(data[0].shape) + else data[0].shape[0] + ) callbacks.on_batch_begin(mode, step, logs) if mode != 'predict': - _inputs = [data[:len(self._inputs)], data[len(self._inputs):]] + _inputs = [data[: len(self._inputs)], data[len(self._inputs) :]] if mode == 'train': - _inputs.append((step + 1) % self._accumulate == 0 - or step + 1 == len(data_loader)) + _inputs.append( + (step + 1) % self._accumulate == 0 + or step + 1 == len(data_loader) + ) outs = getattr(self, mode + '_batch')(*_inputs) @@ -2128,15 +2298,17 @@ class Model(object): logs[k] = v else: if self._inputs is not None: - outs = self.predict_batch(data[:len(self._inputs)]) + outs = self.predict_batch(data[: len(self._inputs)]) else: outs = self.predict_batch(data) outputs.append(outs) logs['step'] = step - if mode == 'train' or self._adapter._merge_count.get( - mode + '_batch', 0) <= 0: + if ( + mode == 'train' + or self._adapter._merge_count.get(mode + '_batch', 0) <= 0 + ): logs['batch_size'] = batch_size * ParallelEnv().nranks else: logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] @@ -2158,10 +2330,10 @@ class Model(object): """Prints a string summary of the network. Args: - input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. - if not set, input_size will get from ``self._inputs`` if network only have - one input, input_size can be tuple or InputSpec. if model have multiple - input, input_size must be a list which contain every input's shape. + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. + if not set, input_size will get from ``self._inputs`` if network only have + one input, input_size can be tuple or InputSpec. if model have multiple + input, input_size must be a list which contain every input's shape. Default: None. dtype (str, optional): if dtype is None, 'float32' will be used, Default: None. 
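As a quick illustration of the `input_size` argument described above, a minimal sketch (the concrete tuple shape is assumed here only for illustration) could be:

.. code-block:: python

    import paddle
    import paddle.nn as nn
    from paddle.static import InputSpec

    input = InputSpec([None, 784], 'float32', 'x')
    model = paddle.Model(nn.Sequential(
        nn.Linear(784, 200),
        nn.Tanh(),
        nn.Linear(200, 10)), input)

    # With no arguments, the shape recorded at Model construction is used.
    print(model.summary())

    # An explicit tuple (or InputSpec) overrides it; dtype defaults to 'float32'.
    print(model.summary(input_size=(1, 784), dtype='float32'))

Both calls return the parameter-count dictionary of the form `{'total_params': ..., 'trainable_params': ...}`.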
@@ -2190,8 +2362,9 @@ class Model(object): # {'total_params': 61610, 'trainable_params': 61610} """ - assert (input_size is not None or self._inputs - is not None), "'input_size' or 'self._input' must be set" + assert ( + input_size is not None or self._inputs is not None + ), "'input_size' or 'self._input' must be set" if input_size is not None: _input_size = input_size else: @@ -2208,7 +2381,10 @@ class Model(object): if is_input: arg_names = extract_args(self.network.forward)[1:] # While Saving inference model in dygraph, and providing inputs only in running. - if shapes is not None and dtypes is not None and fluid._non_static_mode( + if ( + shapes is not None + and dtypes is not None + and fluid._non_static_mode() ): out_specs = [ Input(name=n, dtype=dtypes[i], shape=shapes[i]) @@ -2221,7 +2397,8 @@ class Model(object): elif isinstance(specs, dict): assert is_input is False out_specs = [ - specs[n] for n in extract_args(self.network.forward) + specs[n] + for n in extract_args(self.network.forward) if n != 'self' ] else: @@ -2232,8 +2409,10 @@ class Model(object): assert isinstance(spec, Input) if spec.name is None: raise ValueError( - "Requires Input[{}].name != None, but receive `None` with {}." - .format(i, spec)) + "Requires Input[{}].name != None, but receive `None` with {}.".format( + i, spec + ) + ) return out_specs @@ -2258,6 +2437,7 @@ class Model(object): "Update self._inputs according to given inputs." self._input_info = self._adapter._input_info if self._input_info is not None and len(self._input_info) == 2: - self._inputs = self._verify_spec(None, self._input_info[0], - self._input_info[1], True) + self._inputs = self._verify_spec( + None, self._input_info[0], self._input_info[1], True + ) self._is_shape_inferred = True diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index b2cf55d26488bf6690174dcbb7ba80b1accf94d6..02b844751a8898ee3a6908c0b8c84c3f0b9a89eb 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm( name=None, ): r""" + The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: .. code-block:: python + y = layer_norm(residual + dropout(bias + x)) Parameters: @@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm( name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The output Tensor, the data type and shape is same as `x`. + Tensor, The output Tensor, the data type and shape is same as `x`. Examples: - .. 
code-block:: python # required: gpu @@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm( x, residual, bias) # [2, 4, 128] print(output.shape) + """ seed = None if mode not in ('downscale_in_infer', 'upscale_in_train'): diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index ba14ac5b8652990fa08539f168f3dbd610e66275..c3655c9d93a2745c7bfd8fbc5f6c00621121bf55 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f from paddle.nn import Layer from paddle.framework import ParamAttr import paddle -from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list +from paddle.nn.layer.transformer import ( + _convert_attention_mask, + _convert_param_attr_to_list, +) from paddle.nn.initializer import Constant from paddle.fluid.dygraph import no_grad from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode @@ -51,7 +54,8 @@ def _to_dtype(t, dtype): if t.place.is_gpu_place(): size_dtype = core.size_of_dtype(dtype) waiting_alloc_memory = ( - (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ) gpu_memory_available = core.gpu_memory_available() if gpu_memory_available < waiting_alloc_memory: t_used = t._copy_to(paddle.CPUPlace(), False) @@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer): output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] """ - def __init__(self, - embed_dim, - dropout_rate=0.5, - weight_attr=None, - bias_attr=None, - epsilon=1e-5, - name=None): + def __init__( + self, + embed_dim, + dropout_rate=0.5, + weight_attr=None, + bias_attr=None, + epsilon=1e-5, + name=None, + ): super(FusedBiasDropoutResidualLayerNorm, self).__init__() - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + assert embed_dim > 0, ( + "Expected embed_dim to be greater than 0, " + "but recieved {}".format(embed_dim) + ) self._dtype = self._helper.get_default_dtype() self._bias_attr = bias_attr self._weight_attr = weight_attr self.embed_dim = embed_dim - self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True) + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True, + ) self.ln_scale = self.create_parameter( attr=self._weight_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=self._bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + self.ln_bias = self.create_parameter( + attr=self._bias_attr, shape=[embed_dim], is_bias=True + ) self.dropout_rate = dropout_rate self._epsilon = epsilon @@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ln_epsilon=self._epsilon, training=self.training, mode='upscale_in_train', - name=self.name) + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( - self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, - self._dtype, name_str) + self.embed_dim, + self.seq_len, + self.dropout_rate, + self._epsilon, + self._dtype, + name_str, + ) class 
FusedMultiHeadAttention(Layer): @@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer): output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ - def __init__(self, - embed_dim, - num_heads, - dropout_rate=0.5, - attn_dropout_rate=0.5, - kdim=None, - vdim=None, - normalize_before=False, - need_weights=False, - qkv_weight_attr=None, - qkv_bias_attr=None, - linear_weight_attr=None, - linear_bias_attr=None, - pre_ln_scale_attr=None, - pre_ln_bias_attr=None, - ln_scale_attr=None, - ln_bias_attr=None, - epsilon=1e-5, - nranks=1, - ring_id=-1, - name=None): + def __init__( + self, + embed_dim, + num_heads, + dropout_rate=0.5, + attn_dropout_rate=0.5, + kdim=None, + vdim=None, + normalize_before=False, + need_weights=False, + qkv_weight_attr=None, + qkv_bias_attr=None, + linear_weight_attr=None, + linear_bias_attr=None, + pre_ln_scale_attr=None, + pre_ln_bias_attr=None, + ln_scale_attr=None, + ln_bias_attr=None, + epsilon=1e-5, + nranks=1, + ring_id=-1, + name=None, + ): super(FusedMultiHeadAttention, self).__init__() - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but received {}".format(embed_dim)) - assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(num_heads)) + assert embed_dim > 0, ( + "Expected embed_dim to be greater than 0, " + "but received {}".format(embed_dim) + ) + assert ( + num_heads > 0 + ), "Expected nhead to be greater than 0, " "but received {}".format( + num_heads + ) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer): self.kdim = kdim self.vdim = vdim self.need_weights = need_weights - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + assert ( + self.head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" assert need_weights is False, "Only support need_weight is False now." 
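Since `normalize_before=True` switches the layer to the pre-layer-norm parameter set (`pre_ln_scale`/`pre_ln_bias` instead of `ln_scale`/`ln_bias`), a minimal sketch of that branch, under the same GPU-only requirement as the docstring example, might look like:

.. code-block:: python

    # required: gpu
    import paddle

    embed_dim, num_heads = 128, 2
    # embed_dim must be divisible by num_heads, as asserted above.
    query = paddle.rand((2, 4, embed_dim))
    attn_mask = paddle.rand((2, num_heads, 4, 4))

    # Pre-layer-norm variant; dropout disabled for a deterministic smoke test.
    attn = paddle.incubate.nn.FusedMultiHeadAttention(
        embed_dim, num_heads,
        dropout_rate=0.0,
        attn_dropout_rate=0.0,
        normalize_before=True)
    out = attn(query, None, None, attn_mask=attn_mask)
    print(out.shape)  # [2, 4, 128]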
# tensor model parallel @@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer): shape=[3, num_heads, self.head_dim, embed_dim], attr=qkv_weight_attr, dtype=self._dtype, - is_bias=False) + is_bias=False, + ) self.qkv_bias = self.create_parameter( shape=[3, num_heads, self.head_dim], attr=qkv_bias_attr, dtype=self._dtype, - is_bias=True) + is_bias=True, + ) self.linear_weight = self.create_parameter( shape=[num_heads * self.head_dim, embed_dim], attr=linear_weight_attr, dtype=self._dtype, - is_bias=False) - self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=linear_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=linear_bias_attr, + dtype=self._dtype, + is_bias=True, + ) # tensor model parallel if nranks > 1: @@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer): self.pre_ln_scale = self.create_parameter( attr=pre_ln_scale_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + self.pre_ln_bias = self.create_parameter( + attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True + ) self.ln_scale = None self.ln_bias = None else: @@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer): self.ln_scale = self.create_parameter( attr=ln_scale_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=ln_bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + self.ln_bias = self.create_parameter( + attr=ln_bias_attr, shape=[embed_dim], is_bias=True + ) self.dropout_rate = dropout_rate self.attn_dropout_rate = attn_dropout_rate @@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer): ln_epsilon=self._epsilon, training=self.training, ring_id=self._ring_id, - name=self.name) + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format( - self.embed_dim, self.num_heads, self.dropout_rate, - self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim, - self.normalize_before, self.need_weights, self._dtype, name_str) + self.embed_dim, + self.num_heads, + self.dropout_rate, + self.attn_dropout_rate, + self._epsilon, + self.kdim, + self.vdim, + self.normalize_before, + self.need_weights, + self._dtype, + name_str, + ) def _amp_decorate(self, dtype): # tmp fix for amp.decorator(O2) @@ -495,33 +538,39 @@ class FusedFeedForward(Layer): # (1, 8, 8) """ - def __init__(self, - d_model, - dim_feedforward, - dropout_rate=0.1, - epsilon=1e-05, - activation="relu", - act_dropout_rate=None, - normalize_before=False, - linear1_weight_attr=None, - linear1_bias_attr=None, - linear2_weight_attr=None, - linear2_bias_attr=None, - ln1_scale_attr=None, - ln1_bias_attr=None, - ln2_scale_attr=None, - ln2_bias_attr=None, - nranks=1, - ring_id=-1, - name=None): + def __init__( + self, + d_model, + dim_feedforward, + dropout_rate=0.1, + epsilon=1e-05, + activation="relu", + act_dropout_rate=None, + normalize_before=False, + linear1_weight_attr=None, + linear1_bias_attr=None, + linear2_weight_attr=None, + linear2_bias_attr=None, + ln1_scale_attr=None, + ln1_bias_attr=None, + ln2_scale_attr=None, + ln2_bias_attr=None, + nranks=1, + ring_id=-1, + 
name=None, + ): super(FusedFeedForward, self).__init__() - assert d_model > 0, ( - "Expected d_model to be greater than 0, but received {}".format( - d_model)) - assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but received {}". - format(dim_feedforward)) + assert ( + d_model > 0 + ), "Expected d_model to be greater than 0, but received {}".format( + d_model + ) + assert ( + dim_feedforward > 0 + ), "Expected dim_feedforward to be greater than 0, but received {}".format( + dim_feedforward + ) self._dtype = self._helper.get_default_dtype() self._d_model = d_model @@ -530,7 +579,9 @@ class FusedFeedForward(Layer): dim_feedforward = dim_feedforward // nranks self._dim_feedforward = dim_feedforward self._dropout_rate = dropout_rate - self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate + self._act_dropout_rate = ( + dropout_rate if act_dropout_rate is None else act_dropout_rate + ) self._act_method = activation self._normalize_before = normalize_before self._epsilon = epsilon @@ -540,22 +591,28 @@ class FusedFeedForward(Layer): shape=[d_model, dim_feedforward], attr=linear1_weight_attr, dtype=self._dtype, - is_bias=False) - self._linear1_bias = self.create_parameter(shape=[dim_feedforward], - attr=linear1_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + self._linear1_bias = self.create_parameter( + shape=[dim_feedforward], + attr=linear1_bias_attr, + dtype=self._dtype, + is_bias=True, + ) self._linear2_weight = self.create_parameter( shape=[dim_feedforward, d_model], attr=linear2_weight_attr, dtype=self._dtype, - is_bias=False) + is_bias=False, + ) - self._linear2_bias = self.create_parameter(shape=[d_model], - attr=linear2_bias_attr, - dtype=self._dtype, - is_bias=True) + self._linear2_bias = self.create_parameter( + shape=[d_model], + attr=linear2_bias_attr, + dtype=self._dtype, + is_bias=True, + ) if nranks > 1: assert ring_id != -1 @@ -569,10 +626,11 @@ class FusedFeedForward(Layer): shape=[d_model], attr=ln1_scale_attr, is_bias=False, - default_initializer=Constant(1.0)) - self._ln1_bias = self.create_parameter(shape=[d_model], - attr=ln1_bias_attr, - is_bias=True) + default_initializer=Constant(1.0), + ) + self._ln1_bias = self.create_parameter( + shape=[d_model], attr=ln1_bias_attr, is_bias=True + ) self._ln2_scale = None self._ln2_bias = None else: @@ -582,10 +640,11 @@ class FusedFeedForward(Layer): shape=[d_model], attr=ln2_scale_attr, is_bias=False, - default_initializer=Constant(1.0)) - self._ln2_bias = self.create_parameter(shape=[d_model], - attr=ln2_bias_attr, - is_bias=True) + default_initializer=Constant(1.0), + ) + self._ln2_bias = self.create_parameter( + shape=[d_model], attr=ln2_bias_attr, is_bias=True + ) self.name = name @@ -608,15 +667,23 @@ class FusedFeedForward(Layer): pre_layer_norm=self._normalize_before, training=self.training, ring_id=self._ring_id, - name=self.name) + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format( - self._d_model, self._dim_feedforward, self._dropout_rate, - self._epsilon, self._act_method, self._act_dropout_rate, - self._normalize_before, self._dtype, name_str) + self._d_model, + self._dim_feedforward, + self._dropout_rate, + self._epsilon, + self._act_method, + self._act_dropout_rate, + self._normalize_before, + self._dtype, + name_str, + ) def 
_amp_decorate(self, dtype): # tmp fix for amp.decorator(O2) @@ -640,6 +707,7 @@ class FusedFeedForward(Layer): class FusedTransformerEncoderLayer(Layer): """ + FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If @@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer): Examples: - .. code-block:: python - # required: gpu + # required: gpu import paddle from paddle.incubate.nn import FusedTransformerEncoderLayer @@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer): attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] + """ - def __init__(self, - d_model, - nhead, - dim_feedforward, - dropout_rate=0.1, - activation="relu", - attn_dropout_rate=None, - act_dropout_rate=None, - normalize_before=False, - weight_attr=None, - bias_attr=None): + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout_rate=0.1, + activation="relu", + attn_dropout_rate=None, + act_dropout_rate=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + ): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(FusedTransformerEncoderLayer, self).__init__() - assert d_model > 0, ("Expected d_model to be greater than 0, " - "but received {}".format(d_model)) - assert nhead > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(nhead)) + assert ( + d_model > 0 + ), "Expected d_model to be greater than 0, " "but received {}".format( + d_model + ) + assert ( + nhead > 0 + ), "Expected nhead to be greater than 0, " "but received {}".format( + nhead + ) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but received {}".format(dim_feedforward)) - attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate - act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate + "but received {}".format(dim_feedforward) + ) + attn_dropout_rate = ( + dropout_rate if attn_dropout_rate is None else attn_dropout_rate + ) + act_dropout_rate = ( + dropout_rate if act_dropout_rate is None else act_dropout_rate + ) self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) @@ -739,22 +820,27 @@ class FusedTransformerEncoderLayer(Layer): pre_ln_scale_attr=weight_attrs[0], pre_ln_bias_attr=bias_attrs[0], ln_scale_attr=weight_attrs[0], - ln_bias_attr=bias_attrs[0]) - - self.ffn = FusedFeedForward(d_model, - dim_feedforward, - dropout_rate=dropout_rate, - activation=activation, - act_dropout_rate=act_dropout_rate, - normalize_before=self.normalize_before, - linear1_weight_attr=weight_attrs[1], - linear1_bias_attr=bias_attrs[1], - linear2_weight_attr=weight_attrs[1], - linear2_bias_attr=bias_attrs[1]) + ln_bias_attr=bias_attrs[0], + ) + + self.ffn = FusedFeedForward( + d_model, + dim_feedforward, + dropout_rate=dropout_rate, + activation=activation, + act_dropout_rate=act_dropout_rate, + normalize_before=self.normalize_before, + linear1_weight_attr=weight_attrs[1], + linear1_bias_attr=bias_attrs[1], + linear2_weight_attr=weight_attrs[1], + linear2_bias_attr=bias_attrs[1], + ) def forward(self, src, src_mask=None, cache=None): """ + Applies a Transformer encoder layer on the input. 
+ Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. @@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer): `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. - See `TransformerEncoderLayer.gen_cache` for more details. It is + See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is only used for inference and should be None for training. Default None. + Returns: - Tensor|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple, It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. + """ src_mask = _convert_attention_mask(src_mask, src.dtype) if cache is None: attn_out = self.fused_attn(src, attn_mask=src_mask) else: - attn_out, incremental_cache = self.fused_attn(src, - attn_mask=src_mask, - cache=cache) + attn_out, incremental_cache = self.fused_attn( + src, attn_mask=src_mask, cache=cache + ) ffn_out = self.ffn(attn_out) @@ -889,21 +977,23 @@ class FusedTransformer(Layer): cross_attn_mask) # [2, 6, 128] """ - def __init__(self, - d_model=512, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False, - weight_attr=None, - bias_attr=None, - custom_encoder=None, - custom_decoder=None): + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + custom_encoder=None, + custom_decoder=None, + ): super(fusedTransformer, self).__init__() raise NotImplementedError() @@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer): enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] """ - def __init__(self, - embed_dim, - num_heads, - dim_feedforward, - dropout_rate=0.0, - activation="gelu", - normalize_before=True, - ln_scale_attrs=None, - ln_bias_attrs=None, - qkv_weight_attrs=None, - qkv_bias_attrs=None, - linear_weight_attrs=None, - linear_bias_attrs=None, - ffn_ln_scale_attrs=None, - ffn_ln_bias_attrs=None, - ffn1_weight_attrs=None, - ffn1_bias_attrs=None, - ffn2_weight_attrs=None, - ffn2_bias_attrs=None, - epsilon=1e-5, - num_layers=-1, - nranks=1, - trans_qkvw=True, - ring_id=-1, - name=None): + def __init__( + self, + embed_dim, + num_heads, + dim_feedforward, + dropout_rate=0.0, + activation="gelu", + normalize_before=True, + ln_scale_attrs=None, + ln_bias_attrs=None, + qkv_weight_attrs=None, + qkv_bias_attrs=None, + linear_weight_attrs=None, + linear_bias_attrs=None, + ffn_ln_scale_attrs=None, + ffn_ln_bias_attrs=None, + ffn1_weight_attrs=None, + ffn1_bias_attrs=None, + ffn2_weight_attrs=None, + ffn2_bias_attrs=None, + epsilon=1e-5, + num_layers=-1, + nranks=1, + trans_qkvw=True, + ring_id=-1, + name=None, + ): super(FusedMultiTransformer, self).__init__() - assert embed_dim > 0, ("Expected embed_dim to be 
greater than 0, " - "but received {}".format(embed_dim)) - assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(num_heads)) - assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but received {}". - format(dim_feedforward)) + assert embed_dim > 0, ( + "Expected embed_dim to be greater than 0, " + "but received {}".format(embed_dim) + ) + assert ( + num_heads > 0 + ), "Expected nhead to be greater than 0, " "but received {}".format( + num_heads + ) + assert ( + dim_feedforward > 0 + ), "Expected dim_feedforward to be greater than 0, but received {}".format( + dim_feedforward + ) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer): self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + assert ( + self.head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" # tensor model parallel if nranks > 1: @@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer): ln_scale = self.create_parameter( attr=ln_scale_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - ln_bias = self.create_parameter(attr=ln_bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + ln_bias = self.create_parameter( + attr=ln_bias_attr, shape=[embed_dim], is_bias=True + ) qkv_weight = self.create_parameter( shape=[3, num_heads, self.head_dim, embed_dim] - if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], + if trans_qkvw + else [embed_dim, 3, num_heads, self.head_dim], attr=qkv_weight_attr, dtype=self._dtype, - is_bias=False) + is_bias=False, + ) qkv_bias = self.create_parameter( shape=[3, num_heads, self.head_dim], attr=qkv_bias_attr, dtype=self._dtype, - is_bias=True) + is_bias=True, + ) linear_weight = self.create_parameter( shape=[num_heads * self.head_dim, embed_dim], attr=linear_weight_attr, dtype=self._dtype, - is_bias=False) - linear_bias = self.create_parameter(shape=[embed_dim], - attr=linear_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + linear_bias = self.create_parameter( + shape=[embed_dim], + attr=linear_bias_attr, + dtype=self._dtype, + is_bias=True, + ) ffn_ln_scale = self.create_parameter( shape=[embed_dim], attr=ffn_ln_scale_attr, is_bias=False, - default_initializer=Constant(1.0)) - ffn_ln_bias = self.create_parameter(shape=[embed_dim], - attr=ffn_ln_bias_attr, - is_bias=True) + default_initializer=Constant(1.0), + ) + ffn_ln_bias = self.create_parameter( + shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True + ) ffn1_weight = self.create_parameter( shape=[embed_dim, dim_feedforward], attr=ffn1_weight_attr, dtype=self._dtype, - is_bias=False) - ffn1_bias = self.create_parameter(shape=[dim_feedforward], - attr=ffn1_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + ffn1_bias = self.create_parameter( + shape=[dim_feedforward], + attr=ffn1_bias_attr, + dtype=self._dtype, + is_bias=True, + ) ffn2_weight = self.create_parameter( shape=[dim_feedforward, embed_dim], attr=ffn2_weight_attr, dtype=self._dtype, - is_bias=False) - ffn2_bias = self.create_parameter(shape=[embed_dim], - attr=ffn2_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + ffn2_bias = self.create_parameter( + shape=[embed_dim], + attr=ffn2_bias_attr, + dtype=self._dtype, + is_bias=True, + ) # tensor model 
parallel if nranks > 1: @@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer): mode='upscale_in_train', trans_qkvw=self._trans_qkvw, ring_id=self._ring_id, - name=self.name) + name=self.name, + ) return out diff --git a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py index 58e0fdafab67936f7d857131036e4d9c8ad5dccb..b23eb5e630516498810433ea1828e7b749975126 100644 --- a/python/paddle/incubate/operators/graph_khop_sampler.py +++ b/python/paddle/incubate/operators/graph_khop_sampler.py @@ -20,104 +20,134 @@ from paddle.fluid import core from paddle import _C_ops, _legacy_C_ops -def graph_khop_sampler(row, - colptr, - input_nodes, - sample_sizes, - sorted_eids=None, - return_eids=False, - name=None): +def graph_khop_sampler( + row, + colptr, + input_nodes, + sample_sizes, + sorted_eids=None, + return_eids=False, + name=None, +): """ + Graph Khop Sampler API. - This API is mainly used in Graph Learning domain, and the main purpose is to + This API is mainly used in Graph Learning domain, and the main purpose is to provide high performance graph khop sampling method with subgraph reindex step. For example, we get the CSC(Compressed Sparse Column) format of the input graph - edges as `row` and `colptr`, so as to covert graph data into a suitable format + edges as `row` and `colptr`, so as to covert graph data into a suitable format for sampling. And the `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` means the number of neighbors and number of layers we want - to sample. + to sample. Args: - row (Tensor): One of the components of the CSC format of the input graph, and + row (Tensor): One of the components of the CSC format of the input graph, and the shape should be [num_edges, 1] or [num_edges]. The available data type is int32, int64. colptr (Tensor): One of the components of the CSC format of the input graph, - and the shape should be [num_nodes + 1, 1] or [num_nodes]. + and the shape should be [num_nodes + 1, 1] or [num_nodes]. The data type should be the same with `row`. - input_nodes (Tensor): The input nodes we need to sample neighbors for, and the + input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. sample_sizes (list|tuple): The number of neighbors and number of layers we want to sample. The data type should be int, and the shape should only have one dimension. - sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids` + sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids` is True. The shape should be [num_edges, 1], and the data - type should be the same with `row`. - return_eids (bool): Whether to return the id of the sample edges. Default is False. + type should be the same with `row`. Default is None. + return_eids (bool, optional): Whether to return the id of the sample edges. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - edge_src (Tensor): The src index of the output edges, also means the first column of - the edges. The shape is [num_sample_edges, 1] currently. - edge_dst (Tensor): The dst index of the output edges, also means the second column - of the edges. The shape is [num_sample_edges, 1] currently. - sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes. 
- reindex_nodes (Tensor): The reindex id of the input nodes. - edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. + - edge_src (Tensor), The src index of the output edges, also means the first column of + the edges. The shape is [num_sample_edges, 1] currently. + - edge_dst (Tensor), The dst index of the output edges, also means the second column + of the edges. The shape is [num_sample_edges, 1] currently. + - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes. + - reindex_nodes (Tensor), The reindex id of the input nodes. + - edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True. Examples: - .. code-block:: python - import paddle + import paddle + + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_sizes = [2, 2] + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_sizes = [2, 2] - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") - - edge_src, edge_dst, sample_index, reindex_nodes = \ - paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) + edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) """ if _non_static_mode(): if return_eids: if sorted_eids is None: - raise ValueError(f"`sorted_eid` should not be None " - f"if return_eids is True.") - edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ - _legacy_C_ops.graph_khop_sampler(row, sorted_eids, - colptr, input_nodes, - "sample_sizes", sample_sizes, - "return_eids", True) + raise ValueError( + f"`sorted_eid` should not be None " + f"if return_eids is True." + ) + ( + edge_src, + edge_dst, + sample_index, + reindex_nodes, + edge_eids, + ) = _legacy_C_ops.graph_khop_sampler( + row, + sorted_eids, + colptr, + input_nodes, + "sample_sizes", + sample_sizes, + "return_eids", + True, + ) return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids else: - edge_src, edge_dst, sample_index, reindex_nodes, _ = \ - _legacy_C_ops.graph_khop_sampler(row, None, - colptr, input_nodes, - "sample_sizes", sample_sizes, - "return_eids", False) + ( + edge_src, + edge_dst, + sample_index, + reindex_nodes, + _, + ) = _legacy_C_ops.graph_khop_sampler( + row, + None, + colptr, + input_nodes, + "sample_sizes", + sample_sizes, + "return_eids", + False, + ) return edge_src, edge_dst, sample_index, reindex_nodes - check_variable_and_dtype(row, "Row", ("int32", "int64"), - "graph_khop_sampler") + check_variable_and_dtype( + row, "Row", ("int32", "int64"), "graph_khop_sampler" + ) if return_eids: if sorted_eids is None: - raise ValueError(f"`sorted_eid` should not be None " - f"if return_eids is True.") - check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), - "graph_khop_sampler") + raise ValueError( + f"`sorted_eid` should not be None " f"if return_eids is True." 
+ ) + check_variable_and_dtype( + sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler" + ) - check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), - "graph_khop_sampler") - check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), - "graph_khop_sampler") + check_variable_and_dtype( + colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler" + ) + check_variable_and_dtype( + input_nodes, "X", ("int32", "int64"), "graph_khop_sampler" + ) helper = LayerHelper("graph_khop_sampler", **locals()) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) @@ -125,24 +155,23 @@ def graph_khop_sampler(row, sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) - helper.append_op(type="graph_khop_sampler", - inputs={ - "Row": row, - "Eids": sorted_eids, - "Col_Ptr": colptr, - "X": input_nodes - }, - outputs={ - "Out_Src": edge_src, - "Out_Dst": edge_dst, - "Sample_Index": sample_index, - "Reindex_X": reindex_nodes, - "Out_Eids": edge_eids - }, - attrs={ - "sample_sizes": sample_sizes, - "return_eids": return_eids - }) + helper.append_op( + type="graph_khop_sampler", + inputs={ + "Row": row, + "Eids": sorted_eids, + "Col_Ptr": colptr, + "X": input_nodes, + }, + outputs={ + "Out_Src": edge_src, + "Out_Dst": edge_dst, + "Sample_Index": sample_index, + "Reindex_X": reindex_nodes, + "Out_Eids": edge_eids, + }, + attrs={"sample_sizes": sample_sizes, "return_eids": return_eids}, + ) if return_eids: return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids else: diff --git a/python/paddle/incubate/operators/graph_reindex.py b/python/paddle/incubate/operators/graph_reindex.py index e7e940c2750cca134024af1fcb94999ae1897ba0..f1c771ba45cdc998994b966bd8c8e28fba84c104 100644 --- a/python/paddle/incubate/operators/graph_reindex.py +++ b/python/paddle/incubate/operators/graph_reindex.py @@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops import paddle.utils.deprecated as deprecated -@deprecated(since="2.4.0", - update_to="paddle.geometric.reindex_graph", - level=1, - reason="paddle.incubate.graph_reindex will be removed in future") -def graph_reindex(x, - neighbors, - count, - value_buffer=None, - index_buffer=None, - flag_buffer_hashtable=False, - name=None): +@deprecated( + since="2.4.0", + update_to="paddle.geometric.reindex_graph", + level=1, + reason="paddle.incubate.graph_reindex will be removed in future", +) +def graph_reindex( + x, + neighbors, + count, + value_buffer=None, + index_buffer=None, + flag_buffer_hashtable=False, + name=None, +): """ + Graph Reindex API. This API is mainly used in Graph Learning domain, which should be used @@ -40,11 +45,11 @@ def graph_reindex(x, is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: + Notes: The number in x should be unique, otherwise it would cause potential errors. - Besides, we also support multi-edge-types neighbors reindexing. If we have different - edge_type neighbors for x, we should concatenate all the neighbors and count of x. - We will reindex all the nodes from 0. + Besides, we also support multi-edge-types neighbors reindexing. If we have different + edge_type neighbors for x, we should concatenate all the neighbors and count of x. + We will reindex all the nodes from 0. Take input nodes x = [0, 1, 2] as an example. 
If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], @@ -58,98 +63,105 @@ def graph_reindex(x, should be the same with `x`. count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should - be int32, and should be filled with -1. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should - be int32, and should be filled with -1. - flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should + be int32, and should be filled with -1. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should + be int32, and should be filled with -1. Default is None. + flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up. Default is False. Only useful for gpu version currently. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_dst (Tensor): The destination node index of graph edges after reindex. - out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - where we put the input nodes `x` in the front, and put neighbor - nodes in the back. + - reindex_src (Tensor), The source node index of graph edges after reindex. + - reindex_dst (Tensor), The destination node index of graph edges after reindex. + - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex, + where we put the input nodes `x` in the front, and put neighbor + nodes in the back. Examples: - .. 
code-block:: python - import paddle - - x = [0, 1, 2] - neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] - count_e1 = [2, 3, 2] - x = paddle.to_tensor(x, dtype="int64") - neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") - count_e1 = paddle.to_tensor(count_e1, dtype="int32") - - reindex_src, reindex_dst, out_nodes = \ - paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) - # reindex_src: [3, 4, 0, 5, 6, 7, 6] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] - - neighbors_e2 = [0, 2, 3, 5, 1] - count_e2 = [1, 3, 1] - neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") - count_e2 = paddle.to_tensor(count_e2, dtype="int32") - - neighbors = paddle.concat([neighbors_e1, neighbors_e2]) - count = paddle.concat([count_e1, count_e2]) - reindex_src, reindex_dst, out_nodes = \ - paddle.incubate.graph_reindex(x, neighbors, count) - # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] + import paddle + + x = [0, 1, 2] + neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] + count_e1 = [2, 3, 2] + x = paddle.to_tensor(x, dtype="int64") + neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") + count_e1 = paddle.to_tensor(count_e1, dtype="int32") + + reindex_src, reindex_dst, out_nodes = \ + paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) + # reindex_src: [3, 4, 0, 5, 6, 7, 6] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] + + neighbors_e2 = [0, 2, 3, 5, 1] + count_e2 = [1, 3, 1] + neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") + count_e2 = paddle.to_tensor(count_e2, dtype="int32") + + neighbors = paddle.concat([neighbors_e1, neighbors_e2]) + count = paddle.concat([count_e1, count_e2]) + reindex_src, reindex_dst, out_nodes = \ + paddle.incubate.graph_reindex(x, neighbors, count) + # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] """ if flag_buffer_hashtable: if value_buffer is None or index_buffer is None: - raise ValueError(f"`value_buffer` and `index_buffer` should not" - "be None if `flag_buffer_hashtable` is True.") + raise ValueError( + f"`value_buffer` and `index_buffer` should not" + "be None if `flag_buffer_hashtable` is True." 
+ ) if _non_static_mode(): - reindex_src, reindex_dst, out_nodes = \ - _legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, - "flag_buffer_hashtable", flag_buffer_hashtable) + reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex( + x, + neighbors, + count, + value_buffer, + index_buffer, + "flag_buffer_hashtable", + flag_buffer_hashtable, + ) return reindex_src, reindex_dst, out_nodes check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") - check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), - "graph_reindex") + check_variable_and_dtype( + neighbors, "Neighbors", ("int32", "int64"), "graph_reindex" + ) check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") if flag_buffer_hashtable: - check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), - "graph_reindex") - check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), - "graph_reindex") + check_variable_and_dtype( + value_buffer, "HashTable_Value", ("int32"), "graph_reindex" + ) + check_variable_and_dtype( + index_buffer, "HashTable_Index", ("int32"), "graph_reindex" + ) helper = LayerHelper("graph_reindex", **locals()) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="graph_reindex", - inputs={ - "X": - x, - "Neighbors": - neighbors, - "Count": - count, - "HashTable_Value": - value_buffer if flag_buffer_hashtable else None, - "HashTable_Index": - index_buffer if flag_buffer_hashtable else None, - }, - outputs={ - "Reindex_Src": reindex_src, - "Reindex_Dst": reindex_dst, - "Out_Nodes": out_nodes - }, - attrs={"flag_buffer_hashtable": flag_buffer_hashtable}) + helper.append_op( + type="graph_reindex", + inputs={ + "X": x, + "Neighbors": neighbors, + "Count": count, + "HashTable_Value": value_buffer if flag_buffer_hashtable else None, + "HashTable_Index": index_buffer if flag_buffer_hashtable else None, + }, + outputs={ + "Reindex_Src": reindex_src, + "Reindex_Dst": reindex_dst, + "Out_Nodes": out_nodes, + }, + attrs={"flag_buffer_hashtable": flag_buffer_hashtable}, + ) return reindex_src, reindex_dst, out_nodes diff --git a/python/paddle/incubate/operators/graph_sample_neighbors.py b/python/paddle/incubate/operators/graph_sample_neighbors.py index b230b2a45d58dc2093e747006161c927b2b42020..980071b384b3f73c5d8858cd3fbb5a8c3bd50ed8 100644 --- a/python/paddle/incubate/operators/graph_sample_neighbors.py +++ b/python/paddle/incubate/operators/graph_sample_neighbors.py @@ -25,17 +25,21 @@ import paddle.utils.deprecated as deprecated since="2.4.0", update_to="paddle.geometric.sample_neighbors", level=1, - reason="paddle.incubate.graph_sample_neighbors will be removed in future") -def graph_sample_neighbors(row, - colptr, - input_nodes, - eids=None, - perm_buffer=None, - sample_size=-1, - return_eids=False, - flag_perm_buffer=False, - name=None): + reason="paddle.incubate.graph_sample_neighbors will be removed in future", +) +def graph_sample_neighbors( + row, + colptr, + input_nodes, + eids=None, + perm_buffer=None, + sample_size=-1, + return_eids=False, + flag_perm_buffer=False, + name=None, +): """ + Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to @@ -71,86 +75,109 @@ def graph_sample_neighbors(row, For more information, please refer to :ref:`api_guide_Name`. 
Returns: - out_neighbors (Tensor): The sample neighbors of the input nodes. - out_count (Tensor): The number of sampling neighbors of each input node, and the shape - should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the - sample edges. + - out_neighbors (Tensor), The sample neighbors of the input nodes. + - out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`. + - out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges. Examples: .. code-block:: python - import paddle - # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), - # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_size = 2 - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") - out_neighbors, out_count = \ - paddle.incubate.graph_sample_neighbors(row, colptr, nodes, - sample_size=sample_size) + + import paddle + # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), + # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_size = 2 + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") + out_neighbors, out_count = \ + paddle.incubate.graph_sample_neighbors(row, colptr, nodes, + sample_size=sample_size) """ if return_eids: if eids is None: raise ValueError( - f"`eids` should not be None if `return_eids` is True.") + f"`eids` should not be None if `return_eids` is True." + ) if flag_perm_buffer: if perm_buffer is None: raise ValueError( f"`perm_buffer` should not be None if `flag_perm_buffer`" - "is True.") + "is True." 
+ ) if _non_static_mode(): - out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( - row, colptr, input_nodes, eids, perm_buffer, "sample_size", - sample_size, "return_eids", return_eids, "flag_perm_buffer", - flag_perm_buffer) + ( + out_neighbors, + out_count, + out_eids, + ) = _legacy_C_ops.graph_sample_neighbors( + row, + colptr, + input_nodes, + eids, + perm_buffer, + "sample_size", + sample_size, + "return_eids", + return_eids, + "flag_perm_buffer", + flag_perm_buffer, + ) if return_eids: return out_neighbors, out_count, out_eids return out_neighbors, out_count - check_variable_and_dtype(row, "Row", ("int32", "int64"), - "graph_sample_neighbors") - check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), - "graph_sample_neighbors") - check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + row, "Row", ("int32", "int64"), "graph_sample_neighbors" + ) + check_variable_and_dtype( + colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors" + ) + check_variable_and_dtype( + input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors" + ) if return_eids: - check_variable_and_dtype(eids, "Eids", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + eids, "Eids", ("int32", "int64"), "graph_sample_neighbors" + ) if flag_perm_buffer: - check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + perm_buffer, + "Perm_Buffer", + ("int32", "int64"), + "graph_sample_neighbors", + ) helper = LayerHelper("graph_sample_neighbors", **locals()) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) - helper.append_op(type="graph_sample_neighbors", - inputs={ - "Row": row, - "Col_Ptr": colptr, - "X": input_nodes, - "Eids": eids if return_eids else None, - "Perm_Buffer": - perm_buffer if flag_perm_buffer else None - }, - outputs={ - "Out": out_neighbors, - "Out_Count": out_count, - "Out_Eids": out_eids - }, - attrs={ - "sample_size": sample_size, - "return_eids": return_eids, - "flag_perm_buffer": flag_perm_buffer - }) + helper.append_op( + type="graph_sample_neighbors", + inputs={ + "Row": row, + "Col_Ptr": colptr, + "X": input_nodes, + "Eids": eids if return_eids else None, + "Perm_Buffer": perm_buffer if flag_perm_buffer else None, + }, + outputs={ + "Out": out_neighbors, + "Out_Count": out_count, + "Out_Eids": out_eids, + }, + attrs={ + "sample_size": sample_size, + "return_eids": return_eids, + "flag_perm_buffer": flag_perm_buffer, + }, + ) if return_eids: return out_neighbors, out_count, out_eids return out_neighbors, out_count diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py index 39b439730759cf65cdc76ec449610a2128f9aff8..6c83f5bda498ab3a688378ed0006c82d91fe3e0d 100644 --- a/python/paddle/incubate/xpu/resnet_block.py +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -36,106 +36,232 @@ from paddle import _C_ops, _legacy_C_ops __all__ = ['resnet_basic_block', 'ResNetBasicBlock'] -def resnet_basic_block(x, - filter1, - scale1, - bias1, - mean1, - var1, - filter2, - scale2, - bias2, - mean2, - var2, - filter3, - scale3, - bias3, - mean3, - var3, - stride1, - stride2, - stride3, - padding1, - padding2, - padding3, - dilation1, - dilation2, - dilation3, - groups, - momentum, - eps, - data_format, - 
has_shortcut, - use_global_stats=None, - training=False, - trainable_statistics=False, - find_conv_max=True): +def resnet_basic_block( + x, + filter1, + scale1, + bias1, + mean1, + var1, + filter2, + scale2, + bias2, + mean2, + var2, + filter3, + scale3, + bias3, + mean3, + var3, + stride1, + stride2, + stride3, + padding1, + padding2, + padding3, + dilation1, + dilation2, + dilation3, + groups, + momentum, + eps, + data_format, + has_shortcut, + use_global_stats=None, + training=False, + trainable_statistics=False, + find_conv_max=True, +): if fluid.framework.in_dygraph_mode(): - attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, - 'padding1', padding1, 'padding2', padding2, 'padding3', - padding3, 'dilation1', dilation1, 'dilation2', dilation2, - 'dilation3', dilation3, 'group', groups, 'momentum', momentum, - 'epsilon', eps, 'data_format', data_format, 'has_shortcut', - has_shortcut, 'use_global_stats', use_global_stats, - "trainable_statistics", trainable_statistics, 'is_test', - not training, 'act_type', "relu", 'find_conv_input_max', - find_conv_max) - - out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ - getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ - filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) + attrs = ( + 'stride1', + stride1, + 'stride2', + stride2, + 'stride3', + stride3, + 'padding1', + padding1, + 'padding2', + padding2, + 'padding3', + padding3, + 'dilation1', + dilation1, + 'dilation2', + dilation2, + 'dilation3', + dilation3, + 'group', + groups, + 'momentum', + momentum, + 'epsilon', + eps, + 'data_format', + data_format, + 'has_shortcut', + has_shortcut, + 'use_global_stats', + use_global_stats, + "trainable_statistics", + trainable_statistics, + 'is_test', + not training, + 'act_type', + "relu", + 'find_conv_input_max', + find_conv_max, + ) + + ( + out, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + ) = getattr(_C_ops, "resnet_basic_block")( + x, + filter1, + scale1, + bias1, + mean1, + var1, + filter2, + scale2, + bias2, + mean2, + var2, + filter3, + scale3, + bias3, + mean3, + var3, + mean1, + var1, + mean2, + var2, + mean3, + var3, + *attrs + ) return out helper = LayerHelper('resnet_basic_block', **locals()) bn_param_dtype = fluid.core.VarDesc.VarType.FP32 max_dtype = fluid.core.VarDesc.VarType.FP32 - out = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) - conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + conv1 = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_mean1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) saved_invstd1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) - running_mean1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 - running_var1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 - conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) - conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + 
dtype=bn_param_dtype, stop_gradient=True + ) + running_mean1 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if mean1 is None + else mean1 + ) + running_var1 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if var1 is None + else var1 + ) + conv2 = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + conv2_input = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_mean2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) saved_invstd2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) - running_mean2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 - running_var2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 - conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) + running_mean2 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if mean2 is None + else mean2 + ) + running_var2 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if var2 is None + else var2 + ) + conv3 = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_mean3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) saved_invstd3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) - running_mean3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 - running_var3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 + dtype=bn_param_dtype, stop_gradient=True + ) + running_mean3 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if mean3 is None + else mean3 + ) + running_var3 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if var3 is None + else var3 + ) conv1_input_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv1_filter_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv2_input_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv2_filter_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv3_input_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv3_filter_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) inputs = { 'X': x, @@ -175,7 +301,7 @@ def resnet_basic_block(x, "trainable_statistics": trainable_statistics, 'is_test': not training, 'act_type': "relu", - 'find_conv_input_max': find_conv_max + 'find_conv_input_max': find_conv_max, } outputs = { @@ -203,88 +329,172 @@ def 
resnet_basic_block(x, 'MaxInput3': conv3_input_max, 'MaxFilter3': conv3_filter_max, } - helper.append_op(type='resnet_basic_block', - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs + ) return out class ResNetBasicBlock(Layer): - """ + r""" + ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. - The fusion op architecture like this: - has_shortcut = True: else: - X X - / / - | | | | - CONV1 | CONV1 | - | | | | - BN1 | BN1 | - | | | | - RELU1 | RELU1 | - | | | | - CONV2 CONV3 CONV2 | - | | | | - BN2 BN3 BN2 | - \ / \ / - ADD ADD - | | - RELU RELU - | | - Y Y + If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one time. + If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one time. In this + case the shape of output is same with input. + + + Args: + num_channels (int): The number of input image channel. + num_filter (int): The number of filter. It is as same as the output image channel. + filter_size (int|list|tuple): The filter size. If filter_size + is a tuple, it must contain two integers, (filter_size_height, + filter_size_width). Otherwise, filter_size_height = filter_size_width =\ + filter_size. + stride (int, optional): The stride size. It means the stride in convolution. + If stride is a tuple, it must contain two integers, (stride_height, stride_width). + Otherwise, stride_height = stride_width = stride. Default: stride = 1. + act (str, optional): Activation type, if it is set to None, activation is not appended. + Default: None + momentum (float, optional): The value used for the moving_mean and + moving_var computation. This should be a float number or a Tensor with + shape [1] and data type as float32. The updated formula is: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + eps (float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. Now is only support `"NCHW"`, the data is stored in + the order of: `[batch_size, input_channels, input_height, input_width]`. + has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False. + use_global_stats (bool, optional): Whether to use global mean and + variance. In inference or test mode, set use_global_stats to true + or is_test to true, and the behavior is equivalent. + In train mode, when setting use_global_stats True, the global mean + and variance are also used during train period. Default: False. + is_test (bool, optional): A flag indicating whether it is in + test phrase or not. Default: False. + filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. Default: None. + scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr + as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set, + the parameter is initialized with Xavier. Default: None. 
+ bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. + moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it + is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm + will save global mean with the string. Default: None. + moving_var_name (str, optional): The name of the moving_variance which store the global Variance. + If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm + will save global variance with the string. Default: None. + padding (int, optional): The padding size. It is only spupport padding_height = padding_width = padding. + Default: padding = 0. + dilation (int, optional): The dilation size. It means the spacing between the kernel + points. It is only spupport dilation_height = dilation_width = dilation. + Default: dilation = 1. + trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when + setting trainable_statistics True, mean and variance will be calculated by current batch statistics. + Default: False. + find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True. + + + Returns: + A Tensor representing the ResNetBasicBlock, whose data type is the same with input. + + + Examples: + .. code-block:: python + + # required: xpu + import paddle + from paddle.incubate.xpu.resnet_block import ResNetBasicBlock + + ch_in = 4 + ch_out = 8 + x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.) 
+ resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in, + num_filter1=ch_out, + filter1_size=3, + num_channels2=ch_out, + num_filter2=ch_out, + filter2_size=3, + num_channels3=ch_in, + num_filter3=ch_out, + filter3_size=1, + stride1=1, + stride2=1, + stride3=1, + act='relu', + padding1=1, + padding2=1, + padding3=0, + has_shortcut=True) + out = resnet_basic_block.forward(x) + + print(out.shape) # [2, 8, 16, 16] + """ - def __init__(self, - num_channels1, - num_filter1, - filter1_size, - num_channels2, - num_filter2, - filter2_size, - num_channels3, - num_filter3, - filter3_size, - stride1=1, - stride2=1, - stride3=1, - act='relu', - momentum=0.9, - eps=1e-5, - data_format='NCHW', - has_shortcut=False, - use_global_stats=False, - is_test=False, - filter1_attr=None, - scale1_attr=None, - bias1_attr=None, - moving_mean1_name=None, - moving_var1_name=None, - filter2_attr=None, - scale2_attr=None, - bias2_attr=None, - moving_mean2_name=None, - moving_var2_name=None, - filter3_attr=None, - scale3_attr=None, - bias3_attr=None, - moving_mean3_name=None, - moving_var3_name=None, - padding1=0, - padding2=0, - padding3=0, - dilation1=1, - dilation2=1, - dilation3=1, - trainable_statistics=False, - find_conv_max=True): + def __init__( + self, + num_channels1, + num_filter1, + filter1_size, + num_channels2, + num_filter2, + filter2_size, + num_channels3, + num_filter3, + filter3_size, + stride1=1, + stride2=1, + stride3=1, + act='relu', + momentum=0.9, + eps=1e-5, + data_format='NCHW', + has_shortcut=False, + use_global_stats=False, + is_test=False, + filter1_attr=None, + scale1_attr=None, + bias1_attr=None, + moving_mean1_name=None, + moving_var1_name=None, + filter2_attr=None, + scale2_attr=None, + bias2_attr=None, + moving_mean2_name=None, + moving_var2_name=None, + filter3_attr=None, + scale3_attr=None, + bias3_attr=None, + moving_mean3_name=None, + moving_var3_name=None, + padding1=0, + padding2=0, + padding3=0, + dilation1=1, + dilation2=1, + dilation3=1, + trainable_statistics=False, + find_conv_max=True, + ): super(ResNetBasicBlock, self).__init__() self._stride1 = stride1 self._stride2 = stride2 - self._kernel1_size = utils.convert_to_list(filter1_size, 2, - 'filter1_size') - self._kernel2_size = utils.convert_to_list(filter2_size, 2, - 'filter2_size') + self._kernel1_size = utils.convert_to_list( + filter1_size, 2, 'filter1_size' + ) + self._kernel2_size = utils.convert_to_list( + filter2_size, 2, 'filter2_size' + ) self._dilation1 = dilation1 self._dilation2 = dilation2 self._padding1 = padding1 @@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer): self._find_conv_max = find_conv_max if has_shortcut: - self._kernel3_size = utils.convert_to_list(filter3_size, 2, - 'filter3_size') + self._kernel3_size = utils.convert_to_list( + filter3_size, 2, 'filter3_size' + ) self._padding3 = padding3 self._stride3 = stride3 self._dilation3 = dilation3 @@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer): if data_format not in valid_format: raise ValueError( "conv_format must be one of {}, but got conv_format={}".format( - valid_format, data_format)) + valid_format, data_format + ) + ) def _get_default_param_initializer(channels, kernel_size): filter_elem_num = np.prod(kernel_size) * channels - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return I.Normal(0.0, std) # init filter @@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer): shape=filter1_shape, attr=filter1_attr, default_initializer=_get_default_param_initializer( - num_channels1, self._kernel1_size)) + num_channels1, 
self._kernel1_size + ), + ) self.scale_1 = self.create_parameter( shape=bn1_param_shape, attr=scale1_attr, dtype=bn_param_dtype, - default_initializer=I.Constant(1.0)) - self.bias_1 = self.create_parameter(shape=bn1_param_shape, - attr=bias1_attr, - dtype=bn_param_dtype, - is_bias=True) - self.mean_1 = self.create_parameter(attr=ParamAttr( - name=moving_mean1_name, - initializer=I.Constant(0.0), - trainable=False), - shape=bn1_param_shape, - dtype=bn_param_dtype) + default_initializer=I.Constant(1.0), + ) + self.bias_1 = self.create_parameter( + shape=bn1_param_shape, + attr=bias1_attr, + dtype=bn_param_dtype, + is_bias=True, + ) + self.mean_1 = self.create_parameter( + attr=ParamAttr( + name=moving_mean1_name, + initializer=I.Constant(0.0), + trainable=False, + ), + shape=bn1_param_shape, + dtype=bn_param_dtype, + ) self.mean_1.stop_gradient = True self.var_1 = self.create_parameter( - attr=ParamAttr(name=moving_var1_name, - initializer=I.Constant(1.0), - trainable=False), + attr=ParamAttr( + name=moving_var1_name, + initializer=I.Constant(1.0), + trainable=False, + ), shape=bn1_param_shape, - dtype=bn_param_dtype) + dtype=bn_param_dtype, + ) self.var_1.stop_gradient = True self.filter_2 = self.create_parameter( shape=filter2_shape, attr=filter2_attr, default_initializer=_get_default_param_initializer( - num_channels2, self._kernel2_size)) + num_channels2, self._kernel2_size + ), + ) self.scale_2 = self.create_parameter( shape=bn2_param_shape, attr=scale2_attr, dtype=bn_param_dtype, - default_initializer=I.Constant(1.0)) - self.bias_2 = self.create_parameter(shape=bn2_param_shape, - attr=bias2_attr, - dtype=bn_param_dtype, - is_bias=True) - self.mean_2 = self.create_parameter(attr=ParamAttr( - name=moving_mean2_name, - initializer=I.Constant(0.0), - trainable=False), - shape=bn2_param_shape, - dtype=bn_param_dtype) + default_initializer=I.Constant(1.0), + ) + self.bias_2 = self.create_parameter( + shape=bn2_param_shape, + attr=bias2_attr, + dtype=bn_param_dtype, + is_bias=True, + ) + self.mean_2 = self.create_parameter( + attr=ParamAttr( + name=moving_mean2_name, + initializer=I.Constant(0.0), + trainable=False, + ), + shape=bn2_param_shape, + dtype=bn_param_dtype, + ) self.mean_2.stop_gradient = True self.var_2 = self.create_parameter( - attr=ParamAttr(name=moving_var2_name, - initializer=I.Constant(1.0), - trainable=False), + attr=ParamAttr( + name=moving_var2_name, + initializer=I.Constant(1.0), + trainable=False, + ), shape=bn2_param_shape, - dtype=bn_param_dtype) + dtype=bn_param_dtype, + ) self.var_2.stop_gradient = True if has_shortcut: bn3_param_shape = [1, 1, num_filter3] filter3_shape = [ - num_filter3, num_channels3, filter3_size, filter3_size + num_filter3, + num_channels3, + filter3_size, + filter3_size, ] self.filter_3 = self.create_parameter( shape=filter3_shape, attr=filter3_attr, default_initializer=_get_default_param_initializer( - num_channels3, self._kernel3_size)) + num_channels3, self._kernel3_size + ), + ) self.scale_3 = self.create_parameter( shape=bn3_param_shape, attr=scale3_attr, dtype=bn_param_dtype, - default_initializer=I.Constant(1.0)) - self.bias_3 = self.create_parameter(shape=bn3_param_shape, - attr=bias3_attr, - dtype=bn_param_dtype, - is_bias=True) - self.mean_3 = self.create_parameter(attr=ParamAttr( - name=moving_mean3_name, - initializer=I.Constant(0.0), - trainable=False), - shape=bn3_param_shape, - dtype=bn_param_dtype) + default_initializer=I.Constant(1.0), + ) + self.bias_3 = self.create_parameter( + shape=bn3_param_shape, + attr=bias3_attr, + 
dtype=bn_param_dtype, + is_bias=True, + ) + self.mean_3 = self.create_parameter( + attr=ParamAttr( + name=moving_mean3_name, + initializer=I.Constant(0.0), + trainable=False, + ), + shape=bn3_param_shape, + dtype=bn_param_dtype, + ) self.mean_3.stop_gradient = True - self.var_3 = self.create_parameter(attr=ParamAttr( - name=moving_var3_name, - initializer=I.Constant(1.0), - trainable=False), - shape=bn3_param_shape, - dtype=bn_param_dtype) + self.var_3 = self.create_parameter( + attr=ParamAttr( + name=moving_var3_name, + initializer=I.Constant(1.0), + trainable=False, + ), + shape=bn3_param_shape, + dtype=bn_param_dtype, + ) self.var_3.stop_gradient = True else: self.filter_3 = None @@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer): use_global_stats=self._use_global_stats, training=self.training, trainable_statistics=self._trainable_statistics, - find_conv_max=self._find_conv_max) + find_conv_max=self._find_conv_max, + ) return out diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 747df9baf5d0bc4fb9a2c7b5da9784f9e441dbff..011acc3096cccc0398a4077a737be48ffcac4dbf 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -715,6 +715,7 @@ def upsample( name=None, ): """ + This API resizes a batch of images. The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) @@ -725,11 +726,12 @@ def upsample( and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: - 'linear' : Linear interpolation - 'bilinear' : Bilinear interpolation - 'trilinear' : Trilinear interpolation - 'nearest' : Nearest neighbor interpolation - 'bicubic' : Bicubic interpolation + - 'linear' : Linear interpolation + - 'bilinear' : Bilinear interpolation + - 'trilinear' : Trilinear interpolation + - 'nearest' : Nearest neighbor interpolation + - 'bicubic' : Bicubic interpolation + Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. @@ -762,77 +764,78 @@ def upsample( `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. Example: - .. code-block:: text + .. 
code-block:: text - For scale_factor: - if align_corners = True && out_size > 1 : - scale_factor = (in_size-1.0)/(out_size-1.0) + For scale_factor: + if align_corners = True && out_size > 1 : + scale_factor = (in_size-1.0)/(out_size-1.0) + else: + scale_factor = float(in_size/out_size) + Linear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = W_{in} * scale_{factor} + Nearest neighbor interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = floor (H_{in} * scale_{factor}) + W_out = floor (W_{in} * scale_{factor}) else: - scale_factor = float(in_size/out_size) - Linear interpolation: + align_corners = True + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: if: align_corners = False , align_mode = 0 - input : (N,C,W_in) - output: (N,C,W_out) where: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - input : (N,C,W_in) - output: (N,C,W_out) where: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Bicubic interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Trilinear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = D_{in} * scale_{factor} + H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Nearest neighbor interpolation: - if: - align_corners = False - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = floor (H_{in} * scale_{factor}) - W_out = floor (W_{in} * scale_{factor}) - else: - align_corners = True - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = round(H_{in} * scale_{factor}) - W_out = round(W_{in} * scale_{factor}) - Bilinear interpolation: - if: - align_corners = False , align_mode = 0 - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - Bicubic interpolation: - if: - align_corners = False - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - Trilinear interpolation: - if: - align_corners = False , align_mode = 0 - input : (N,C,D_in,H_in,W_in) - output: (N,C,D_out,H_out,W_out) where: - D_out = (D_{in}+0.5) * scale_{factor} - 0.5 - H_out = 
(H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,D_in,H_in,W_in) - output: (N,C,D_out,H_out,W_out) where: - D_out = D_{in} * scale_{factor} - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - https://en.wikipedia.org/wiki/Linear_interpolation. For details of linear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Linear_interpolation. For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. @@ -876,23 +879,24 @@ def upsample( name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + Returns: A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Examples: + Examples: .. code-block:: python - import paddle - import paddle.nn as nn + import paddle + import paddle.nn as nn - input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) - upsample_out = paddle.nn.Upsample(size=[12,12]) + input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + upsample_out = paddle.nn.Upsample(size=[12,12]) - output = upsample_out(x=input_data) - print(output.shape) - # [2L, 3L, 12L, 12L] + output = upsample_out(x=input_data) + print(output.shape) + # [2L, 3L, 12L, 12L] """ return interpolate( diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index 1c29d509741018623ea3436c8116fc984053fbb1..0c3a1a8b0d72a429e3ad0d48b2640e8e328ceb79 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -23,6 +23,7 @@ __all__ = [] def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): r""" + It computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): Returns: Tensor, the dtype is same as input tensor. + - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f5b6965544fadf1a66b8041b3e31e8b94eee6931..4bb19343c13a64325ab6216d49f78cac3881b74c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1310,6 +1310,7 @@ def margin_ranking_loss( def l1_loss(input, label, reduction='mean', name=None): r""" + Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. If `reduction` set to ``'none'``, the loss is: @@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None): Returns: Tensor, the L1 Loss of Tensor ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . 
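# Hedged sketch of the functional pairwise_distance defined above; with p=2 the
# result is the Euclidean distance of each row pair (the values are made up).
import paddle
x = paddle.to_tensor([[1., 3.], [3., 3.]])
y = paddle.to_tensor([[0., 4.], [1., 1.]])
dist = paddle.nn.functional.pairwise_distance(x, y, p=2.)
print(dist)  # approximately [1.4142, 2.8284]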
+ If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` . If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: @@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None): l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') print(l1_loss.numpy()) # [1.4] + """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( @@ -2286,6 +2288,7 @@ def cross_entropy( name=None, ): r""" + By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable computing. @@ -2399,21 +2402,13 @@ def cross_entropy( Parameters: - - - **input** (Tensor) - - Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` . Note: - - 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the - output of softmax operator, which will produce incorrect results. - + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. - - **label** (Tensor) - + label (Tensor): 1. If soft_label=False, the shape is :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. the data type is int32, int64, float32, float64, where each value is [0, C-1]. @@ -2421,48 +2416,27 @@ def cross_entropy( 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - - **weight** (Tensor, optional) - - a manual rescaling weight given to each class. + weight (Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - - - **ignore_index** (int64, optional) - - Specifies a target value that is ignored + ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the loss. A negative value means that no label value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - - - **reduction** (str, optional) - - Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - - - **soft_label** (bool, optional) - - Indicate whether label is soft. - Default is ``False``. - - - **axis** (int, optional) - - The index of dimension to perform softmax calculations. + soft_label (bool, optional): Indicate whether label is soft. Default is ``False``. + axis (int, optional):The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number of dimensions of input :attr:`input`. Default is ``-1`` . - - - **use_softmax** (bool, optional) - - Indicate whether compute softmax before cross_entropy. 
+ use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy. Default is ``True``. - - - **name** (str, optional) - - The name of the operator. Default is ``None`` . + name (str, optional): The name of the operator. Default is ``None`` . For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -2478,9 +2452,7 @@ def cross_entropy( 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . - Examples: - .. code-block:: python # hard labels @@ -3834,6 +3806,7 @@ def triplet_margin_loss( def soft_margin_loss(input, label, reduction='mean', name=None): """ + The API measures the soft margin loss between input predictions ``input`` and target labels ``label`` . It can be described as: @@ -3842,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None): Parameters: - input (Tensor): The input predications tensor with shape: [N, *], + input (Tensor): The input predications tensor with shape: ``[N, *]``, N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf. - Available dtype is float32, float64. + Available dtype is float32, float64. label (Tensor): The target labels tensor with the same shape as ``input``. The target labels which values should be numbers -1 or 1. @@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): Returns: - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is [1]. + Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1]. Examples: .. code-block:: python @@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678], # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790], # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]]) + """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index c64a79a43467d3c5543159d8eb7ae77754731d32..388ab4c6944cc01c80c2e4ca8988573b57dc31b3 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1735,16 +1735,18 @@ def adaptive_avg_pool1d(x, output_size, name=None): def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): - """ + r""" + Applies 2D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. For avg adaptive pool2d: + .. math:: - hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + hstart &= floor(i * H_{in} / H_{out}) \\ + hend &= ceil((i + 1) * H_{in} / H_{out}) \\ + wstart &= floor(j * W_{in} / W_{out}) \\ + wend &= ceil((j + 1) * W_{in} / W_{out}) \\ Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Args: @@ -1753,14 +1755,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. 
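# Hedged sketch of calling the functional cross_entropy documented above with
# hard labels; the shapes below are assumptions for illustration only.
import paddle
logits = paddle.rand([4, 10], dtype="float32")            # [N, C] unscaled logits
labels = paddle.randint(0, 10, shape=[4], dtype="int64")  # hard labels in [0, C-1]
loss = paddle.nn.functional.cross_entropy(logits, labels, reduction="mean")
print(loss.shape)  # [1]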
An optional string + data_format (str, optional): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: - Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. + Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor. Examples: .. code-block:: python @@ -1788,6 +1791,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): x = x, output_size=[3, 3]) # out.shape is [2, 3, 3, 3] + """ if not in_dynamic_mode(): check_variable_and_dtype( @@ -1879,35 +1883,37 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): - """ + r""" + This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. For avg adaptive pool3d: + .. math:: - dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + dstart &= floor(i * D_{in} / D_{out}) \\ + dend &= ceil((i + 1) * D_{in} / D_{out}) \\ + hstart &= floor(j * H_{in} / H_{out}) \\ + hend &= ceil((j + 1) * H_{in} / H_{out}) \\ + wstart &= floor(k * W_{in} / W_{out}) \\ + wend &= ceil((k + 1) * W_{in} / W_{out}) \\ Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} {(dend - dstart) * (hend - hstart) * (wend - wstart)} Args: x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. - The data type can be float32, float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or + list, it must contain three elements, (D, H, W). D, H and W can be either a int, + or None which means the size will be the same as that of the input. + data_format (str, optional): The data format of the input and output data. An optional string from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. + Returns: - Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. + Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor. Examples: .. 
code-block:: python @@ -1937,6 +1943,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): x = input_data, output_size=[3, 3, 3]) # out.shape is [2, 3, 3, 3, 3] + """ if not in_dynamic_mode(): check_variable_and_dtype( diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 27e18e73842c39ffdda91c540c781ee9e676cd05..6736a9a6128627319ab248ccccbdc348b13a4e14 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -1450,15 +1450,16 @@ class Maxout(Layer): class Softmax2D(Layer): r""" + Softmax2D Activation. Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j). The sum of result in each location (C, H_i, W_j) will be one. Shape: - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)` - - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input) + - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input) - Return: + Returns: A Tensor of the same shape and dtype as input with value in range [0, 1]. Examples: @@ -1483,6 +1484,7 @@ class Softmax2D(Layer): # [[0.42368975 0.51082766 0.47752273 0.5258871 ] # [0.66754097 0.47182566 0.5187628 0.5402329 ] # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]] + """ def __init__(self, name=None): diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index a7a488c833d7ffaa88634b729938c57ef1c55452..98381b471d6f346e53913bb554ec1ef8e3dbe7b4 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -20,6 +20,7 @@ __all__ = [] class PairwiseDistance(Layer): r""" + It computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -38,14 +39,14 @@ class PairwiseDistance(Layer): Generally, no setting is required. Default: None. Shape: - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - is the dimension of the data. Available data type is float32, float64. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - output: The same dtype as input tensor. + - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` + is the dimension of the data. Available data type is float32, float64. + - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. + - output: The same dtype as input tensor. - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index a80d3706fbfde55dcfc2e458c23af40e9503b617..51706ee336f6852bde004fbe93c20460b72cb96a 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -31,7 +31,8 @@ __all__ = [] class BCEWithLogitsLoss(Layer): r""" - This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. + + This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. 
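# Hedged arithmetic sketch of the adaptive average pooling bins defined by the
# floor/ceil formulas above (H_in=7 pooled down to H_out=3, values assumed).
import math
H_in, H_out = 7, 3
for i in range(H_out):
    hstart = math.floor(i * H_in / H_out)
    hend = math.ceil((i + 1) * H_in / H_out)
    print(i, (hstart, hend))  # bins: (0, 3), (2, 5), (4, 7)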
@@ -54,7 +55,7 @@ class BCEWithLogitsLoss(Layer): For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: - .. math:: + .. math:: Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the @@ -86,21 +87,21 @@ class BCEWithLogitsLoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shapes: - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *], - N is batch_size, `*` means number of additional dimensions. The ``logit`` - is usually the output of Linear layer. Available dtype is float32, float64. - label (Tensor): The target labels tensor. 2-D tensor with the same shape as - ``logit``. The target labels which values should be numbers between 0 and 1. - Available dtype is float32, float64. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``logit`` , else the shape of output is scalar. + - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, `*`], + N is batch_size, `*` means number of additional dimensions. The ``logit`` + is usually the output of Linear layer. Available dtype is float32, float64. + - label (Tensor): The target labels tensor. 2-D tensor with the same shape as + ``logit``. The target labels which values should be numbers between 0 and 1. + Available dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``logit`` , else the shape of output is scalar. Returns: A callable object of BCEWithLogitsLoss. Examples: - .. code-block:: python + import paddle logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") @@ -139,6 +140,7 @@ class BCEWithLogitsLoss(Layer): class CrossEntropyLoss(Layer): r""" + By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable computing. @@ -251,60 +253,35 @@ class CrossEntropyLoss(Layer): Parameters: - - - **weight** (Tensor, optional) - - a manual rescaling weight given to each class. + weight (Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - - - **ignore_index** (int64, optional) - - Specifies a target value that is ignored + ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the loss. A negative value means that no label value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - - - **reduction** (str, optional) - - Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - - - **soft_label** (bool, optional) - - Indicate whether label is soft. + soft_label (bool, optional): Indicate whether label is soft. If soft_label=False, the label is hard. If soft_label=True, the label is soft. Default is ``False``. - - - **axis** (int, optional) - - The index of dimension to perform softmax calculations. 
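# Hedged numeric check of the stable BCE-with-logits form shown above:
# max(Logit, 0) - Logit * Label + log(1 + exp(-|Logit|)) matches the naive loss.
import math
logit, label = -3.0, 1.0
p = 1.0 / (1.0 + math.exp(-logit))                       # sigmoid(logit)
naive = -(label * math.log(p) + (1 - label) * math.log(1 - p))
stable = max(logit, 0.0) - logit * label + math.log(1.0 + math.exp(-abs(logit)))
print(round(naive, 6), round(stable, 6))                 # both ~= 3.048587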
+ axis (int, optional): The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number of dimensions of input :attr:`input`. Default is ``-1`` . - - - **use_softmax** (bool, optional) - - Indicate whether compute softmax before cross_entropy. + use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy. Default is ``True``. - - - **name** (str, optional) - - The name of the operator. Default is ``None`` . + name (str, optional): The name of the operator. Default is ``None`` . For more information, please refer to :ref:`api_guide_Name` . Shape: - - - **input** (Tensor) - - Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . - + - **input** (Tensor), the data type is float32, float64. Shape is + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . Note: 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the @@ -312,7 +289,6 @@ class CrossEntropyLoss(Layer): 2. when use_softmax=False, it expects the output of softmax operator. - - **label** (Tensor) 1. If soft_label=False, the shape is @@ -322,15 +298,10 @@ class CrossEntropyLoss(Layer): 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - - **output** (Tensor) - - Return the softmax cross_entropy loss of ``input`` and ``label``. - - The data type is the same as input. - - If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. - - If :attr:`reduction` is ``'none'``: + - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``. + The data type is the same as input. + If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. + If :attr:`reduction` is ``'none'``: 1. If soft_label = False, the dimension of return value is the same with ``label`` . @@ -634,6 +605,7 @@ class MSELoss(Layer): class L1Loss(Layer): r""" + Construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. @@ -663,11 +635,11 @@ class L1Loss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. - label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64. - output (Tensor): The L1 Loss of ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . - If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + - input (Tensor): The input tensor. The shapes is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. + - label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64. + - output (Tensor): The L1 Loss of ``input`` and ``label``. + If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. 
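# Hedged sketch of CrossEntropyLoss with soft labels as described above; the
# soft labels are produced with softmax only so that each row sums to 1.
import paddle
ce = paddle.nn.CrossEntropyLoss(soft_label=True)
logits = paddle.rand([4, 10], dtype="float32")
soft_labels = paddle.nn.functional.softmax(paddle.rand([4, 10]), axis=-1)
print(ce(logits, soft_labels).shape)  # [1]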
Examples: .. code-block:: python @@ -692,6 +664,7 @@ class L1Loss(Layer): print(output) # [[0.20000005 0.19999999] # [0.2 0.79999995]] + """ def __init__(self, reduction='mean', name=None): @@ -712,6 +685,7 @@ class L1Loss(Layer): class BCELoss(Layer): """ + This interface is used to construct a callable object of the ``BCELoss`` class. The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input`` and target labels ``label`` . The binary_cross_entropy loss can be described as: @@ -755,14 +729,14 @@ class BCELoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means - number of additional dimensions. The input ``input`` should always - be the output of sigmod. Available dtype is float32, float64. - label (Tensor): 2-D tensor with the same shape as ``input``. The target - labels which values should be numbers between 0 and 1. Available - dtype is float32, float64. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is scalar. + - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means + number of additional dimensions. The input ``input`` should always + be the output of sigmod. Available dtype is float32, float64. + - label (Tensor): 2-D tensor with the same shape as ``input``. The target + labels which values should be numbers between 0 and 1. Available + dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is scalar. Returns: A callable object of BCELoss. @@ -855,7 +829,7 @@ class NLLLoss(Layer): if `reduction` is ``'sum'``, the reduced sum loss is returned; if `reduction` is ``'none'``, no reduction will be apllied. Default is ``'mean'``. - name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Shape: - input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes. @@ -914,6 +888,7 @@ class NLLLoss(Layer): class KLDivLoss(Layer): r""" + Generate a callable object of 'KLDivLoss' to calculate the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability @@ -933,14 +908,10 @@ class KLDivLoss(Layer): Default is ``'mean'``. Shape: - - - input (Tensor): (N, *), where * means, any number of additional dimensions. - - - label (Tensor): (N, *), same shape as input. - + - input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions. + - label (Tensor): ``(N, *)``, same shape as input. - output (Tensor): tensor with shape: [1] by default. - Examples: .. code-block:: python @@ -970,6 +941,7 @@ class KLDivLoss(Layer): kldiv_criterion = nn.KLDivLoss(reduction='none') pred_loss = kldiv_criterion(x, target) # shape=[5, 20] + """ def __init__(self, reduction='mean'): @@ -1720,6 +1692,7 @@ class TripletMarginLoss(Layer): class SoftMarginLoss(Layer): r""" + Creates a criterion that measures a two-class soft margin loss between input predictions ``input`` and target labels ``label`` . It can be described as: @@ -1738,17 +1711,14 @@ class SoftMarginLoss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
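# Hedged sketch of the BCELoss layer described above; ``input`` must already be
# sigmoid probabilities, so a Sigmoid layer is applied first. Values are made up.
import paddle
m = paddle.nn.Sigmoid()
bce = paddle.nn.BCELoss()            # reduction defaults to 'mean'
logit = paddle.to_tensor([0.5, 0.6, 0.7], dtype="float32")
label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
print(bce(m(logit), label))          # scalar mean loss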
Shapes: - - Input (Tensor): The input tensor with shape: [N, *], - N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf - Available dtype is float32, float64. - - Label (Tensor): The target labels tensor with the same shape as - ``input``. The target labels which values should be numbers -1 or 1. - Available dtype is int32, int64, float32, float64. - - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is [1]. + - Input (Tensor): The input tensor with shape: ``[N, *]``, + N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf + Available dtype is float32, float64. + - Label (Tensor): The target labels tensor with the same shape as + ``input``. The target labels which values should be numbers -1 or 1. + Available dtype is int32, int64, float32, float64. + - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is [1]. Returns: A callable object of SoftMarginLoss. @@ -1780,6 +1750,7 @@ class SoftMarginLoss(Layer): # [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511], # [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399], # [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]]) + """ def __init__(self, reduction='mean', name=None): diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 1bf7fb179e5d8770531447bca9e6e2fe38434063..d359f576dd6a9adf3166f14289e6d633bd647aa9 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -321,6 +321,7 @@ Where `H` means height of feature map, `W` means width of feature map. class GroupNorm(Layer): """ + This interface is used to construct a callable object of the ``GroupNorm`` class. For more details, refer to code examples. It implements the function of the Group Normalization Layer. @@ -341,7 +342,7 @@ class GroupNorm(Layer): name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: Tensor with shape: (batch, num_features, *). + - x: Tensor with shape: attr:`(batch, num_features, *)`. - output: The same shape as input x. Returns: @@ -1047,6 +1048,7 @@ class BatchNorm3D(_BatchNormBase): class SyncBatchNorm(_BatchNormBase): r""" + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected @@ -1092,9 +1094,9 @@ class SyncBatchNorm(_BatchNormBase): - :math:`\beta` : trainable shift parameter vector Note: - If you want to use container to pack your model and has ``SyncBatchNorm`` in the - evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of - ``list`` to pack the model. + If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the + evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of + :ref:`api_paddle_hub_list` to pack the model. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -1112,29 +1114,30 @@ class SyncBatchNorm(_BatchNormBase): have trainable bias parameter. Default: None. Shapes: - input: Tensor that the dimension from 2 to 5. - output: Tensor with the same shape as input. 
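# Hedged sketch of the GroupNorm layer described above, normalizing 6 channels
# in 3 groups; the input shape is an assumption for illustration.
import paddle
x = paddle.rand([2, 6, 4, 4], dtype="float32")     # (batch, num_features, H, W)
gn = paddle.nn.GroupNorm(num_groups=3, num_channels=6)
print(gn(x).shape)  # [2, 6, 4, 4]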
+ - input: Tensor that the dimension from 2 to 5. + - output: Tensor with the same shape as input. Examples: .. code-block:: python - # required: gpu + # required: gpu - import paddle - import paddle.nn as nn + import paddle + import paddle.nn as nn - x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - if paddle.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1) - # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[[[ 0.26824948, 1.09363246], - # [ 0.26824948, -1.63013160]], + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) + # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[[ 0.26824948, 1.09363246], + # [ 0.26824948, -1.63013160]], + + # [[ 0.80956620, -0.66528702], + # [-1.27446556, 1.13018656]]]]) - # [[ 0.80956620, -0.66528702], - # [-1.27446556, 1.13018656]]]]) """ def __init__( @@ -1284,8 +1287,8 @@ class SyncBatchNorm(_BatchNormBase): The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead. Examples: - .. code-block:: python + import paddle import paddle.nn as nn diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 8371bbeaa582d3cdb9340a594977a3b16294adeb..75580342b392c2c7e64800af02b325480fcace6b 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -224,6 +224,7 @@ class AvgPool2D(Layer): class AvgPool3D(Layer): """ + This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCDHW format, where N is batch size, C is the number of channels, @@ -264,6 +265,7 @@ class AvgPool3D(Layer): The data type can be float32, float64. - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + Examples: .. code-block:: python diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 59d1389f09974372268d0c6015076d0d71081f6b..72bad0d44a8c38759d7ba2113a5cd991aeebe036 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -514,14 +514,17 @@ class QuantizedConv2D(Layer): class QuantizedConv2DTranspose(Layer): """ + The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The only difference is that its inputs are all fake quantized. Examples: .. code-block:: python + import paddle import paddle.nn as nn from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) 
conv = nn.Conv2DTranspose(4, 6, (3, 3)) conv_quantized = QuantizedConv2DTranspose(conv) @@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer): y_np = y_var.numpy() print(y_np.shape, y_quantized_np.shape) # (2, 6, 10, 10), (2, 6, 10, 10) + """ def __init__(self, diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 4e2dbafcabbc0b474c6a4b8fd83f7593032fb239..77a332cbdd997142258ef79b0ee4e2381cb00fd1 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1661,6 +1661,7 @@ class MultiplicativeDecay(LRScheduler): class OneCycleLR(LRScheduler): r""" + Sets the learning rate according to the one cycle learning rate scheduler. The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate. @@ -1674,22 +1675,25 @@ class OneCycleLR(LRScheduler): Also note that you should update learning rate each step. Args: - max_learning_rate (float): The maximum learning rate. It is a python float number. - Functionally, it defines the initial learning rate by ``divide_factor`` . + max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` . total_steps (int): Number of total training steps. - divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. - anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, - 'linear' for linear annealing. Default: 'cos'. + anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'. three_phase (bool, optional): Whether to use three phase. + If ``True``: + 1. The learning rate will first increase from initial learning rate to maximum learning rate. 2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase. 3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate. + If ``False``: + 1. The learning rate will increase to maximum learning rate. 2. Then it will directly decrease to minimum learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -1741,6 +1745,7 @@ class OneCycleLR(LRScheduler): }, fetch_list=loss.name) scheduler.step() # You should update learning rate each step + """ def __init__( diff --git a/python/paddle/signal.py b/python/paddle/signal.py index 656605f1bf2b7c5b6ee0f5c9e8bdc453910bf169..6a4de719a9c8ee97898ba28e66420c7f110a1df8 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -39,15 +39,15 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): with shape `[..., seq_length]` or `[seq_length, ...]`. 
frame_length (int): Length of the frame and `0 < frame_length <= x.shape[axis]`. hop_length (int): Number of steps to advance between adjacent frames - and `0 < hop_length`. + and `0 < hop_length`. axis (int, optional): Specify the axis to operate on the input Tensors. Its value should be 0(the first dimension) or -1(the last dimension). If not - specified, the last axis is used by default. + specified, the last axis is used by default. Returns: The output frames tensor with shape `[..., frame_length, num_frames]` if `axis==-1`, otherwise `[num_frames, frame_length, ...]` where - + `num_framse = 1 + (x.shape[axis] - frame_length) // hop_length` Examples: @@ -56,7 +56,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): import paddle from paddle.signal import frame - + # 1D x = paddle.arange(8) y0 = frame(x, frame_length=4, hop_length=2, axis=-1) # [4, 3] @@ -124,7 +124,8 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): if frame_length > x.shape[axis]: raise ValueError( f'Attribute frame_length should be less equal than sequence length, ' - f'but got ({frame_length}) > ({x.shape[axis]}).') + f'but got ({frame_length}) > ({x.shape[axis]}).' + ) op_type = 'frame' @@ -132,25 +133,33 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): return _C_ops.frame(x, frame_length, hop_length, axis) if _in_legacy_dygraph(): - attrs = ('frame_length', frame_length, 'hop_length', hop_length, 'axis', - axis) + attrs = ( + 'frame_length', + frame_length, + 'hop_length', + hop_length, + 'axis', + axis, + ) op = getattr(_legacy_C_ops, op_type) out = op(x, *attrs) else: check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type + ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type=op_type, - inputs={'X': x}, - attrs={ - 'frame_length': frame_length, - 'hop_length': hop_length, - 'axis': axis - }, - outputs={'Out': out}) + helper.append_op( + type=op_type, + inputs={'X': x}, + attrs={ + 'frame_length': frame_length, + 'hop_length': hop_length, + 'axis': axis, + }, + outputs={'Out': out}, + ) return out @@ -163,10 +172,10 @@ def overlap_add(x, hop_length, axis=-1, name=None): with shape `[..., frame_length, num_frames]` or `[num_frames, frame_length ...]`. hop_length (int): Number of steps to advance between adjacent frames and - `0 < hop_length <= frame_length`. + `0 < hop_length <= frame_length`. axis (int, optional): Specify the axis to operate on the input Tensors. Its value should be 0(the first dimension) or -1(the last dimension). If not - specified, the last axis is used by default. + specified, the last axis is used by default. Returns: The output frames tensor with shape `[..., seq_length]` if `axis==-1`, @@ -180,7 +189,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): import paddle from paddle.signal import overlap_add - + # 2D x0 = paddle.arange(16).reshape([8, 2]) # [[0 , 1 ], @@ -205,7 +214,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): y0 = overlap_add(x0, hop_length=2, axis=-1) # [2, 1, 10] x1 = paddle.arange(32).reshape([2, 8, 1, 2]) - y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] + y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] """ if axis not in [0, -1]: raise ValueError(f'Unexpected axis: {axis}. 
It should be 0 or -1.') @@ -225,32 +234,34 @@ def overlap_add(x, hop_length, axis=-1, name=None): out = op(x, *attrs) else: check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type + ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type=op_type, - inputs={'X': x}, - attrs={ - 'hop_length': hop_length, - 'axis': axis - }, - outputs={'Out': out}) + helper.append_op( + type=op_type, + inputs={'X': x}, + attrs={'hop_length': hop_length, 'axis': axis}, + outputs={'Out': out}, + ) return out -def stft(x, - n_fft, - hop_length=None, - win_length=None, - window=None, - center=True, - pad_mode='reflect', - normalized=False, - onesided=True, - name=None): +def stft( + x, + n_fft, + hop_length=None, + win_length=None, + window=None, + center=True, + pad_mode='reflect', + normalized=False, + onesided=True, + name=None, +): r""" + Short-time Fourier transform (STFT). The STFT computes the discrete Fourier transforms (DFT) of short overlapping @@ -263,11 +274,14 @@ def stft(x, Where: - :math:`t`: The :math:`t`-th input window. + - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`, - or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + - :math:`N`: Value of `n_fft`. - - :math:`H`: Value of `hop_length`. - + + - :math:`H`: Value of `hop_length`. + Args: x (Tensor): The input data which is a 1-dimensional or 2-dimensional Tensor with shape `[..., seq_length]`. It can be a real-valued or a complex Tensor. @@ -292,10 +306,10 @@ def stft(x, to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`( - real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( - `onesided` is `False`) - + The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]` + (real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]` + (`onesided` is `False`) + Examples: .. code-block:: python @@ -311,14 +325,17 @@ def stft(x, x = paddle.randn([8, 48000], dtype=paddle.float64) + \ paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128 y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] + """ - check_variable_and_dtype(x, 'x', - ['float32', 'float64', 'complex64', 'complex128'], - 'stft') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft' + ) x_rank = len(x.shape) - assert x_rank in [1, 2], \ - f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' + assert x_rank in [ + 1, + 2, + ], f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' if x_rank == 1: # (batch, seq_length) x = x.unsqueeze(0) @@ -326,69 +343,77 @@ def stft(x, if hop_length is None: hop_length = int(n_fft // 4) - assert hop_length > 0, \ - f'hop_length should be > 0, but got {hop_length}.' + assert hop_length > 0, f'hop_length should be > 0, but got {hop_length}.' if win_length is None: win_length = n_fft if _non_static_mode(): - assert 0 < n_fft <= x.shape[-1], \ - f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' 
+ assert ( + 0 < n_fft <= x.shape[-1] + ), f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' - assert 0 < win_length <= n_fft, \ - f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' + assert ( + 0 < win_length <= n_fft + ), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' if window is not None: - assert len(window.shape) == 1 and len(window) == win_length, \ - f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' + assert ( + len(window.shape) == 1 and len(window) == win_length + ), f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' else: - window = paddle.ones(shape=(win_length, ), dtype=x.dtype) + window = paddle.ones(shape=(win_length,), dtype=x.dtype) if win_length < n_fft: pad_left = (n_fft - win_length) // 2 pad_right = n_fft - win_length - pad_left - window = paddle.nn.functional.pad(window, - pad=[pad_left, pad_right], - mode='constant') + window = paddle.nn.functional.pad( + window, pad=[pad_left, pad_right], mode='constant' + ) if center: - assert pad_mode in ['constant', 'reflect'], \ - 'pad_mode should be "reflect" or "constant", but got "{}".'.format(pad_mode) + assert pad_mode in [ + 'constant', + 'reflect', + ], 'pad_mode should be "reflect" or "constant", but got "{}".'.format( + pad_mode + ) pad_length = n_fft // 2 # FIXME: Input `x` can be a complex tensor but pad does not supprt complex input. - x = paddle.nn.functional.pad(x.unsqueeze(-1), - pad=[pad_length, pad_length], - mode=pad_mode, - data_format="NLC").squeeze(-1) + x = paddle.nn.functional.pad( + x.unsqueeze(-1), + pad=[pad_length, pad_length], + mode=pad_mode, + data_format="NLC", + ).squeeze(-1) x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1) x_frames = x_frames.transpose( - perm=[0, 2, - 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) + perm=[0, 2, 1] + ) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) x_frames = paddle.multiply(x_frames, window) norm = 'ortho' if normalized else 'backward' if is_complex(x_frames): - assert not onesided, \ - 'onesided should be False when input or window is a complex Tensor.' + assert ( + not onesided + ), 'onesided should be False when input or window is a complex Tensor.' if not is_complex(x): - out = fft_r2c(x=x_frames, - n=None, - axis=-1, - norm=norm, - forward=True, - onesided=onesided, - name=name) + out = fft_r2c( + x=x_frames, + n=None, + axis=-1, + norm=norm, + forward=True, + onesided=onesided, + name=name, + ) else: - out = fft_c2c(x=x_frames, - n=None, - axis=-1, - norm=norm, - forward=True, - name=name) + out = fft_c2c( + x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name + ) out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames) @@ -398,22 +423,24 @@ def stft(x, return out -def istft(x, - n_fft, - hop_length=None, - win_length=None, - window=None, - center=True, - normalized=False, - onesided=True, - length=None, - return_complex=False, - name=None): +def istft( + x, + n_fft, + hop_length=None, + win_length=None, + window=None, + center=True, + normalized=False, + onesided=True, + length=None, + return_complex=False, + name=None, +): r""" Inverse short-time Fourier transform (ISTFT). Reconstruct time-domain signal from the giving complex input and window tensor when - nonzero overlap-add (NOLA) condition is met: + nonzero overlap-add (NOLA) condition is met: .. 
math:: \sum_{t = -\infty}^{\infty}% @@ -432,7 +459,7 @@ def istft(x, Args: x (Tensor): The input data which is a 2-dimensional or 3-dimensional **complesx** - Tensor with shape `[..., n_fft, num_frames]`. + Tensor with shape `[..., n_fft, num_frames]`. n_fft (int): The size of Fourier transform. hop_length (int, optional): Number of steps to advance between adjacent windows from time-domain signal and `0 < hop_length < win_length`. Default: `None`( @@ -452,10 +479,10 @@ def istft(x, and `istft` will return a real-valued tensor when it is set to `True`. Default: `True`. length (int, optional): Specify the length of time-domain signal. Default: `None`( - treated as the whole length of signal). + treated as the whole length of signal). return_complex (bool, optional): It means that whether the time-domain signal is real-valued. If `return_complex` is set to `True`, `onesided` should be set to - `False` cause the output is complex. + `False` cause the output is complex. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -484,8 +511,12 @@ def istft(x, check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft') x_rank = len(x.shape) - assert x_rank in [2, 3], \ - 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(x_rank) + assert x_rank in [ + 2, + 3, + ], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format( + x_rank + ) if x_rank == 2: # (batch, n_fft, n_frames) x = x.unsqueeze(0) @@ -497,83 +528,107 @@ def istft(x, win_length = n_fft # Assure no gaps between frames. - assert 0 < hop_length <= win_length, \ - 'hop_length should be in (0, win_length({})], but got {}.'.format(win_length, hop_length) - - assert 0 < win_length <= n_fft, \ - 'win_length should be in (0, n_fft({})], but got {}.'.format(n_fft, win_length) + assert ( + 0 < hop_length <= win_length + ), 'hop_length should be in (0, win_length({})], but got {}.'.format( + win_length, hop_length + ) + + assert ( + 0 < win_length <= n_fft + ), 'win_length should be in (0, n_fft({})], but got {}.'.format( + n_fft, win_length + ) n_frames = x.shape[-1] fft_size = x.shape[-2] if _non_static_mode(): if onesided: - assert (fft_size == n_fft // 2 + 1), \ - 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) + assert ( + fft_size == n_fft // 2 + 1 + ), 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format( + n_fft // 2 + 1, fft_size + ) else: - assert (fft_size == n_fft), \ - 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) + assert ( + fft_size == n_fft + ), 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format( + n_fft, fft_size + ) if window is not None: - assert len(window.shape) == 1 and len(window) == win_length, \ - 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape) + assert ( + len(window.shape) == 1 and len(window) == win_length + ), 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format( + win_length, window.shape + ) else: - window_dtype = paddle.float32 if x.dtype in [ - paddle.float32, paddle.complex64 - ] else paddle.float64 - window = paddle.ones(shape=(win_length, ), dtype=window_dtype) + window_dtype = ( + paddle.float32 + if x.dtype in [paddle.float32, 
paddle.complex64] + else paddle.float64 + ) + window = paddle.ones(shape=(win_length,), dtype=window_dtype) if win_length < n_fft: pad_left = (n_fft - win_length) // 2 pad_right = n_fft - win_length - pad_left # FIXME: Input `window` can be a complex tensor but pad does not supprt complex input. - window = paddle.nn.functional.pad(window, - pad=[pad_left, pad_right], - mode='constant') + window = paddle.nn.functional.pad( + window, pad=[pad_left, pad_right], mode='constant' + ) x = x.transpose( - perm=[0, 2, - 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) + perm=[0, 2, 1] + ) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) norm = 'ortho' if normalized else 'backward' if return_complex: - assert not onesided, \ - 'onesided should be False when input(output of istft) or window is a complex Tensor.' + assert ( + not onesided + ), 'onesided should be False when input(output of istft) or window is a complex Tensor.' out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) else: - assert not is_complex(window), \ - 'Data type of window should not be complex when return_complex is False.' + assert not is_complex( + window + ), 'Data type of window should not be complex when return_complex is False.' if onesided is False: - x = x[:, :, :n_fft // 2 + 1] + x = x[:, :, : n_fft // 2 + 1] out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = paddle.multiply(out, window).transpose( - perm=[0, 2, 1]) # (batch, n_fft, num_frames) - out = overlap_add(x=out, hop_length=hop_length, - axis=-1) # (batch, seq_length) + perm=[0, 2, 1] + ) # (batch, n_fft, num_frames) + out = overlap_add( + x=out, hop_length=hop_length, axis=-1 + ) # (batch, seq_length) window_envelop = overlap_add( x=paddle.tile( x=paddle.multiply(window, window).unsqueeze(0), - repeat_times=[n_frames, - 1]).transpose(perm=[1, 0]), # (n_fft, num_frames) + repeat_times=[n_frames, 1], + ).transpose( + perm=[1, 0] + ), # (n_fft, num_frames) hop_length=hop_length, - axis=-1) # (seq_length, ) + axis=-1, + ) # (seq_length, ) if length is None: if center: - out = out[:, (n_fft // 2):-(n_fft // 2)] - window_envelop = window_envelop[(n_fft // 2):-(n_fft // 2)] + out = out[:, (n_fft // 2) : -(n_fft // 2)] + window_envelop = window_envelop[(n_fft // 2) : -(n_fft // 2)] else: if center: start = n_fft // 2 else: start = 0 - out = out[:, start:start + length] - window_envelop = window_envelop[start:start + length] + out = out[:, start : start + length] + window_envelop = window_envelop[start : start + length] # Check whether the Nonzero Overlap Add (NOLA) constraint is met. if _non_static_mode() and window_envelop.abs().min().item() < 1e-11: diff --git a/python/paddle/sparse/nn/layer/activation.py b/python/paddle/sparse/nn/layer/activation.py index 7ca3e8efcf4b0cb08ba58512fac7eb166a8b9d71..3e8e81ea0b11dcc5419b9d01b8592daf328aca7d 100644 --- a/python/paddle/sparse/nn/layer/activation.py +++ b/python/paddle/sparse/nn/layer/activation.py @@ -20,6 +20,7 @@ __all__ = [] class ReLU(Layer): """ + Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -44,6 +45,7 @@ class ReLU(Layer): relu = paddle.sparse.nn.ReLU() out = relu(sparse_x) # [0., 0., 1.] + """ def __init__(self, name=None): @@ -59,14 +61,15 @@ class ReLU(Layer): class Softmax(Layer): - """ + r""" + Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. 
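To make the overlap-add and NOLA handling in the ``stft``/``istft`` hunks above easier to follow, a small round-trip sketch (illustrative only, not part of this diff); the signal length, ``n_fft`` and ``hop_length`` are arbitrary example values, and the defaults (rectangular window, ``center=True``) are assumed:

.. code-block:: python

    # Illustrative only: analyse with stft, resynthesise with istft, and
    # confirm the overlap-add reconstruction matches the input signal.
    import paddle
    from paddle.signal import stft, istft

    x = paddle.randn([4, 16000], dtype='float64')           # (batch, seq_length)
    spec = stft(x, n_fft=512, hop_length=128)                # one-sided complex spectrum
    y = istft(spec, n_fft=512, hop_length=128, length=16000)

    # With centered framing and a window satisfying the NOLA condition,
    # the reconstruction should match the input up to numerical error.
    print(paddle.allclose(x, y, atol=1e-6).item())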
Note: - Only support axis=-1 for SparseCsrTensor, which is faster when read data + Only support axis=-1 for SparseCsrTensor, which is faster when read data by row (axis=-1). - From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` + From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` in the matrix, we have: .. math:: @@ -96,17 +99,17 @@ class Softmax(Layer): # [0. 0. 0. 0.98275049]] csr = paddle.to_tensor(np_x).to_sparse_csr() - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, # 0.98275049]) softmax = paddle.sparse.nn.Softmax() out = softmax(csr) - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269, # 1. ]) """ @@ -126,6 +129,7 @@ class Softmax(Layer): class ReLU6(Layer): """ + Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -149,6 +153,7 @@ class ReLU6(Layer): sparse_x = dense_x.to_sparse_coo(1) relu6 = paddle.sparse.nn.ReLU6() out = relu6(sparse_x) + """ def __init__(self, name=None): @@ -164,8 +169,9 @@ class ReLU6(Layer): class LeakyReLU(Layer): - """ - Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. + r""" + + Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -196,6 +202,7 @@ class LeakyReLU(Layer): sparse_x = dense_x.to_sparse_coo(1) leaky_relu = paddle.sparse.nn.LeakyReLU(0.5) out = leaky_relu(sparse_x) + """ def __init__(self, negative_slope=0.01, name=None): diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 560d1ad49d1c690a0b6050dd530cb73147e33f2b..82a016ce64da7afd1ddc11ee20fe2893fbdc7bbb 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1180,7 +1180,8 @@ def triu(x, diagonal=0, name=None): def meshgrid(*args, **kwargs): """ - Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids. + + Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids. 
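A minimal sketch (illustrative only, not part of this diff) of what the reworded ``meshgrid`` summary above means in practice; the input sizes are arbitrary:

.. code-block:: python

    # Illustrative only: two 1-D inputs of sizes (3,) and (4,) yield
    # two grids, each with shape [3, 4].
    import paddle

    x = paddle.arange(3, dtype='int64')
    y = paddle.arange(4, dtype='int64')
    grid_x, grid_y = paddle.meshgrid(x, y)
    print(grid_x.shape, grid_y.shape)  # [3, 4] [3, 4]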
Args: *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 55726831d2e359a9825574b12f020f9dc1836e5b..0ffd461c633c5cf44c55d2231f6722ebeb498bd2 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -22,9 +22,17 @@ from .math import multiply from .math import sum as paddle_sum from ..fluid.framework import _in_legacy_dygraph from paddle import _C_ops, _legacy_C_ops -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, +) from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph +from ..fluid.framework import ( + _non_static_mode, + in_dygraph_mode, + _in_legacy_dygraph, +) import collections import string import opt_einsum @@ -47,17 +55,18 @@ def parse_op_labels(labelstr, operand): Returns ------- - the input operand's full label string in which all anonymous dimensions are - labeled in dots. + the input operand's full label string in which all anonymous dimensions are + labeled in dots. ''' # Sanity checks for c in labelstr.replace('.', ''): - assert c.isalpha(), ( - f"Invalid equation: {c} is not a valid label, which should be letters." - ) + assert ( + c.isalpha() + ), f"Invalid equation: {c} is not a valid label, which should be letters." - assert labelstr.replace('...', '', 1).find('.') == -1, ( - f"Invalid equation: `.` is found outside of an ellipsis.") + assert ( + labelstr.replace('...', '', 1).find('.') == -1 + ), f"Invalid equation: `.` is found outside of an ellipsis." # Check shape. Note, in Paddle a tensor rank is always nonzero ndims = len(operand.shape) @@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand): full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3)) - assert len(full_labelstr) == ndims, ( - f"Invalid equation: the label string '{labelstr}' misses dimensions.") + assert ( + len(full_labelstr) == ndims + ), f"Invalid equation: the label string '{labelstr}' misses dimensions." return full_labelstr @@ -74,14 +84,14 @@ def parse_op_labels(labelstr, operand): def parse_labels(labelstr, operands): ''' Parse label strings for all input operands. - + Parameters ---------- labelstr: The equation's label string operands: The input operands - + Returns ------- list of full label strings for all input operands @@ -90,19 +100,21 @@ def parse_labels(labelstr, operands): nop_labels = labelstr.split(',') assert len(nop_labels) == len(operands), ( f"Invalid equation: the number of operands is {len(operands)}, " - f"but found {len(nop_labels)} segments in the label equation.") + f"but found {len(nop_labels)} segments in the label equation." + ) return list(map(parse_op_labels, nop_labels, operands)) def validate_rhs(rhs, input_labels, n_bcast_dims): ''' - Check whether the equation's right hand side is valid + Check whether the equation's right hand side is valid ''' # Sanity check. if n_bcast_dims > 0: - assert '...' in rhs, ( - f"Invalid equation: missing ellipsis in output labels.") + assert ( + '...' in rhs + ), f"Invalid equation: missing ellipsis in output labels." 
rhs = rhs.replace('...', '') rhs_set = set(rhs) @@ -114,16 +126,18 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): non_input_labels = rhs_set.difference(input_labels) assert not non_input_labels, ( f"Invalid equation: " - f"output label {sorted(non_input_labels)} not used by any input.") + f"output label {sorted(non_input_labels)} not used by any input." + ) # Verify that output labels are not duplicate - assert len(rhs) == len(rhs_set), ( - f"Invalid equation: duplicate output labels are found.") + assert len(rhs) == len( + rhs_set + ), f"Invalid equation: duplicate output labels are found." def build_view(in_labels, out_labels): ''' - Build an inverse map of dimension indices. Three conditions must hold for - the result to be meaningful. + Build an inverse map of dimension indices. Three conditions must hold for + the result to be meaningful. First, no duplicate letter labels in each label string. Second, the number of dots in dimout_labels >= that in in_labels. Third, dots are contiguous in each label string. @@ -134,7 +148,7 @@ def build_view(in_labels, out_labels): The dimension labels to map to out_labels: The dimension labels to map from - + Returns ------- The inverse map from out_labels to in_labels. The length of the inverse map equals that of @@ -159,8 +173,8 @@ def build_view(in_labels, out_labels): # fill the broadcast dimension indices from right to left. if s: for ax, dim in zip( - range(start, end)[::-1], - range(s.start(), s.end())[::-1]): + range(start, end)[::-1], range(s.start(), s.end())[::-1] + ): inv_map[ax] = dim # Now work on non-broadcast dimensions @@ -181,7 +195,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): plus an index table that maps from the layout to the dimensions in each operand. In the global view, the dimensions are arranged such that output ones are put on the left and contraction ones - are put on the right. + are put on the right. Parameters ---------- @@ -191,7 +205,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): The equation right hand side n_bcast_dims: The maxium number of broadcast dimensions - + Returns ------- A tuple of g_labels, g_view, g_nout, g_count @@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): g_labels_out = rhs.replace('...', '.' * n_bcast_dims) else: g_labels_out = '.' * n_bcast_dims + ''.join( - l for l, c in zip(labels, count) if c == 1) + l for l, c in zip(labels, count) if c == 1 + ) for i in range(len(count))[::-1]: if labels[i] in g_labels_out: @@ -237,7 +252,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): def build_global_shape(g_view, g_labels, op_shapes): ''' - The global shape is the shape of all dimensions rearranged and broadcasting + The global shape is the shape of all dimensions rearranged and broadcasting to the global view. It's a reference data structure for einsum planning. Parameters @@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes): assert not non_bcastable, ( f"Invalid operands: label {g_labels[non_bcastable[0]]} " - f"corresponds to non-broadcastable dimensions.") + f"corresponds to non-broadcastable dimensions." + ) g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] - g_masks = [[s > 1 or s == -1 for s in view_shape] - for view_shape in view_shapes] + g_masks = [ + [s > 1 or s == -1 for s in view_shape] for view_shape in view_shapes + ] return g_shape, g_masks @@ -287,18 +304,19 @@ def has_duplicated_labels(labels): def diagonalize(labels, operand): ''' - Merges dimensions with duplicate labels. 
- + Merges dimensions with duplicate labels. + For those dimensions with duplicate labels, merge them into one dimension which represents the diagonal elements. This requires the dimensions with duplicate labels are equal sized. - + Examples - -------- + -------- 'ijj...i' would be merged into 'ij...' ''' - assert not has_duplicated_labels(labels), ( - f'Duplicate labels are not supported.') + assert not has_duplicated_labels( + labels + ), f'Duplicate labels are not supported.' return labels, operand @@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): plan.add_step(step) # Check if conditions hold for turnning the operation into a matmul - if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( - (op1_vshape, op2_vshape)): - op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) - ] + [np.prod(op1_vshape[K])] - op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) - ] + [np.prod(op2_vshape[K])] + if ( + j1 + j2 > 0 + and k > 0 + and -1 not in np.concatenate((op1_vshape, op2_vshape)) + ): + op1_shape = ( + list(op1_vshape[I]) + + [np.prod(op1_vshape[J1])] + + [np.prod(op1_vshape[K])] + ) + op2_shape = ( + list(op2_vshape[I]) + + [np.prod(op2_vshape[J2])] + + [np.prod(op2_vshape[K])] + ) # Merge J dims and K dims by reshaping step = reshape, [var1], var1, op1_shape @@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): step = squeeze, [var2], var2, [-1, -2] plan.add_step(step) elif j1 + j2 == 0 and not -1 in np.concatenate( - (op1_vshape[K], op2_vshape[K])): + (op1_vshape[K], op2_vshape[K]) + ): assert all(op1_vshape[K] == op2_vshape[K]) - step = reshape, [ - var1 - ], var1, list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])] + step = ( + reshape, + [var1], + var1, + list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])], + ) plan.add_step(step) - step = reshape, [ - var2 - ], var2, list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])] + step = ( + reshape, + [var2], + var2, + list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])], + ) plan.add_step(step) step = matmul, [var1, var2], var2, False, True plan.add_step(step) @@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): g_view[op2] = list(op2_view) -def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, - n_bcast): +def plan_summation( + plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast +): ''' Plan various kinds of summation ''' @@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, I, K, J1, J2 = list(range(n_bcast)), [], [], [] - for ax, dim1, dim2 in zip(range(n_bcast, ndim), op1_view[n_bcast:], - op2_view[n_bcast:]): + for ax, dim1, dim2 in zip( + range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:] + ): if (dim1 != -1) != (dim2 != -1): if dim1 != -1: @@ -531,7 +567,6 @@ def plan_broadcast(plan, operands, nop_axes): class Plan: - def __init__(self): self.env = {} self.steps = [] @@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): # op1 is a one element tensor. 
plan_scalar_prod(plan, i - 1, i) else: - plan_summation(plan, g_view, i - 1, i, g_supports, g_shape, g_count, - n_bcast) + plan_summation( + plan, g_view, i - 1, i, g_supports, g_shape, g_count, n_bcast + ) # for ax, dim in enumerate(g_view[nop-1][:nout]): # assert dim == ax @@ -678,7 +714,9 @@ def preprocess(equation, *operands): """ equation = equation.replace(" ", "") nop = len(operands) - assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop + assert nop > 0, ( + "Required at least one operand in Einsum API, but received %s " % nop + ) # Part the equation to left hand side and right hand side lhs, *rhs = equation.lower().split('->') @@ -692,37 +730,43 @@ def preprocess(equation, *operands): assert len(lhs.split(',')) == len(operands), ( f"Invalid equation: the number of operands is {len(operands)}, " - f"but found {len(lhs.split(','))} segments in the label equation.") + f"but found {len(lhs.split(','))} segments in the label equation." + ) - assert not ('...' in lhs and '...' not in rhs - ), f'Invalid equation: missing ellipsis in output labels.' + assert not ( + '...' in lhs and '...' not in rhs + ), f'Invalid equation: missing ellipsis in output labels.' - assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > - 0), f'Duplicate labels are not supported.' + assert not ( + len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0 + ), f'Duplicate labels are not supported.' assert not has_duplicated_labels( - rhs), f'Invalid equation: duplicate output labels are found.' + rhs + ), f'Invalid equation: duplicate output labels are found.' return lhs, rhs, labels def parse_fake_shape(equation, operands, labels): - """ + """ + this shape is just used for operands planning. may differ with the original shape. - for example: + for example: ... is replaced by 1 -1 is replaced by 1 Results ------- list of shape + """ shaped = collections.namedtuple('shaped', ['shape']) def fake_shape(label, op): - assert len(op.shape) == len( - label - ), "length of shape and length of label must be the same, but received %d != %d" % ( - len(op.shape), len(label)) + assert len(op.shape) == len(label), ( + "length of shape and length of label must be the same, but received %d != %d" + % (len(op.shape), len(label)) + ) fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.'] fakes = list(map(abs, fakes)) # make -1 -> 1 if '.' in label: @@ -734,7 +778,6 @@ def parse_fake_shape(equation, operands, labels): def rhs_inference(lhs): - def is_free(key): return cnt.get(key) == 1 and key not in ['.', ','] @@ -745,7 +788,7 @@ def rhs_inference(lhs): def gen_equation_for_opteinsum(lhs, rhs): - """ + """ 1. gen rhs if rhs is None 2. '...' -> 'A' """ @@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs): def get_used_label(counter): used = set(counter.elements()) for c in string.ascii_lowercase: - if c not in used: return c + if c not in used: + return c raise ValueError( "You have used all `a` - `z`, there can't find a unused for einsum optimization" ) @@ -768,7 +812,7 @@ def gen_equation_for_opteinsum(lhs, rhs): def einsum_v2(equation, *operands): - """ + """ einsum v2 implementation. 1. Implement C++ EinsumOp. 2. V2 create the EinsumOp to calculate, so just a little verifty work in python. @@ -786,20 +830,21 @@ def einsum_v2(equation, *operands): var_list = list(operands) for path in cons: (a, b), _, eq, *__ = path - assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." 
+ assert ( + a > b + ), "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." var_s = [var_list.pop(a), var_list.pop(b)] eq = eq.replace(broadcast_label, "...") var_list.append(gen_einsum_op(eq, *var_s)) - assert len( - var_list - ) == 1, "There must be one elements in list, but received %d." % len( - var_list) + assert ( + len(var_list) == 1 + ), "There must be one elements in list, but received %d." % len(var_list) return var_list[0] def gen_einsum_op(equation, *operands): - """ - EinsumOp Python Interface: + """ + EinsumOp Python Interface: """ assert len(operands) <= 2, "Only support two operands in EinsumOp." if in_dygraph_mode(): @@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands): if _in_legacy_dygraph(): # dygraph - return _legacy_C_ops.einsum(operands, len(operands), len(operands), - 'equation', equation)[0] + return _legacy_C_ops.einsum( + operands, len(operands), len(operands), 'equation', equation + )[0] for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') @@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands): helper.create_variable_for_type_inference(dtype=operands[0].dtype) for i in range(len(operands)) ] - helper.append_op(type='einsum', - inputs={'Operands': operands}, - outputs={ - 'Out': out, - "InnerCache": caches, - "XShape": xshape - }, - attrs=attrs) + helper.append_op( + type='einsum', + inputs={'Operands': operands}, + outputs={'Out': out, "InnerCache": caches, "XShape": xshape}, + attrs=attrs, + ) return out def einsum(equation, *operands): r""" + einsum(equation, *operands) The current version of this API should be used in dygraph only mode. @@ -862,39 +907,39 @@ def einsum(equation, *operands): - for many operads - broadcasting multiply - chained matrix multiply - + **The summation notation** - The tensor dimensions are labeled using uncased English letters. E.g., `ijk` - relates to a three dimensional tensor whose dimensions are labeled i, j, and k. + relates to a three dimensional tensor whose dimensions are labeled i, j, and k. - The equation is `,` separated into terms, each being a distinct input's - dimension label string. + dimension label string. - Ellipsis `...` enables broadcasting by automatically converting the unlabeled - dimensions into broadcasting dimensions. + dimensions into broadcasting dimensions. - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled - dimensions will be reduced and removed in the output. - - Output labels can be explicitly specified on the right hand side of `->` or omitted. - In the latter case, the output labels will be inferred from the input labels. + dimensions will be reduced and removed in the output. + - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels. - Inference of output labels - Broadcasting label `...`, if present, is put on the leftmost position. - Free labels are reordered alphabetically and put after `...`. - On explicit output labels - If broadcasting is enabled, then `...` must be present. - The output labels can be an empty, an indication to output as a scalar - the sum over the original output. + the sum over the original output. - Non-input labels are invalid. - Duplicate labels are invalid. - - For any dummmy label which is present for the output, it's promoted to - a free label. + - For any dummy label which is present for the output, it's promoted to + a free label. 
- For any free label which is not present for the output, it's lowered to - a dummy label. + a dummy label. + - Examples - '...ij, ...jk', where i and k are free labels, j is dummy. The output label - string is '...ik' - - 'ij -> i', where i is a free label and j is a dummy label. + string is '...ik' + - 'ij -> i', where i is a free label and j is a dummy label. - '...ij, ...jk -> ...ijk', where i, j and k are all free labels. - '...ij, ...jk -> ij', an invalid equation since `...` is not present for - the output. + the output. **The summation rule** @@ -902,15 +947,15 @@ def einsum(equation, *operands): may vary significantly due to implementation specific optimization. - Step 1: preparation for broadcasting, that is, transposing and unsqueezing - the input operands to have each resulting dimension identically labeled across - all the input operands. + the input operands to have each resulting dimension identically labeled across + all the input operands. - Step 2: broadcasting multiply all the resulting operands from step 1. - Step 3: reducing dummy labeled dimensions. - Step 4: transposing the result tensor to match the output labels. **On trace and diagonal** - The trace and diagonal are planned yet unimplemented features. + The trace and diagonal are planned yet unimplemented features. Args: equation (`str`): @@ -918,82 +963,84 @@ def einsum(equation, *operands): operands (`list|Tensor`): The input tensors over which to compute the Einstein summation. The number of operands should equal the number of input terms in the equation. - + Returns: - result (`Tensor`): the result tensor. - + result (`Tensor`), the result tensor. + Examples: .. code-block:: python - import paddle - paddle.seed(102) - x = paddle.rand([4]) - y = paddle.rand([5]) - - # sum - print(paddle.einsum('i->', x)) - # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # 1.95791852) - - # dot - print(paddle.einsum('i,i->', x, x)) - # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [1.45936954]) - - # outer - print(paddle.einsum("i,j->ij", x, y)) - # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], - # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], - # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], - # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) - - A = paddle.rand([2, 3, 2]) - B = paddle.rand([2, 2, 3]) - - # transpose - print(paddle.einsum('ijk->kji', A)) - # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.49684682], - # [0.80071914, 0.46258664], - # [0.49814570, 0.33383518]], - # - # [[0.07637714, 0.29374704], - # [0.51470858, 0.51907635], - # [0.99066722, 0.55802226]]]) - - # batch matrix multiplication - print(paddle.einsum('ijk, ikl->ijl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) - - # Ellipsis transpose - print(paddle.einsum('...jk->...kj', A)) - # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.80071914, 0.49814570], - # [0.07637714, 0.51470858, 0.99066722]], - # - # [[0.49684682, 0.46258664, 0.33383518], - # 
[0.29374704, 0.51907635, 0.55802226]]]) - - # Ellipsis batch matrix multiplication - print(paddle.einsum('...jk, ...kl->...jl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) + import paddle + paddle.seed(102) + x = paddle.rand([4]) + y = paddle.rand([5]) + + # sum + print(paddle.einsum('i->', x)) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 1.95791852) + + # dot + print(paddle.einsum('i,i->', x, x)) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [1.45936954]) + + # outer + print(paddle.einsum("i,j->ij", x, y)) + # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], + # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], + # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], + # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) + + A = paddle.rand([2, 3, 2]) + B = paddle.rand([2, 2, 3]) + + # transpose + print(paddle.einsum('ijk->kji', A)) + # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.49684682], + # [0.80071914, 0.46258664], + # [0.49814570, 0.33383518]], + # + # [[0.07637714, 0.29374704], + # [0.51470858, 0.51907635], + # [0.99066722, 0.55802226]]]) + + # batch matrix multiplication + print(paddle.einsum('ijk, ikl->ijl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + + # Ellipsis transpose + print(paddle.einsum('...jk->...kj', A)) + # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.80071914, 0.49814570], + # [0.07637714, 0.51470858, 0.99066722]], + # + # [[0.49684682, 0.46258664, 0.33383518], + # [0.29374704, 0.51907635, 0.55802226]]]) + + # Ellipsis batch matrix multiplication + print(paddle.einsum('...jk, ...kl->...jl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + """ import os + if int(os.environ.get('FLAGS_new_einsum', "1")): return einsum_v2(equation, *operands) @@ -1039,9 +1086,11 @@ def einsum(equation, *operands): # Counting how many non-trivial dimensions remain for each ax g_labels, g_view, g_nout, g_count = build_global_view( - nop_labels, rhs, n_bcast_dims) - g_shape, g_supports = build_global_shape(g_view, g_labels, - [op.shape for op in operands]) + nop_labels, rhs, n_bcast_dims + ) + g_shape, g_supports = build_global_shape( + g_view, g_labels, [op.shape for op in operands] + ) # Now we're ready to build up an execution plan args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 
ace9c8e98d6df92ee0fbd8feecd1d74c6a0d1612..132129ae628bc212c0e0b77fbb41d11cb052e285 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1912,12 +1912,15 @@ def mv(x, vec, name=None): def det(x, name=None): """ + Calculates determinant value of a square matrix or batches of square matrices. Args: - x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the + x (Tensor): the input matrix of size `(n, n)` or the batch of matrices of size `(*, n, n)` where `*` is one or more batch dimensions. + name(str, optional): Name of the output. Default is None. It's used + to print debug info for developers. Details: :ref:`api_guide_Name` Returns: Tensor, the determinant value of a square matrix or batches of square matrices. @@ -1968,18 +1971,20 @@ def det(x, name=None): def slogdet(x, name=None): """ + Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. - The determinant can be computed with ``sign * exp(logabsdet) + The determinant can be computed with ``sign * exp`` (logabsdet) Supports input of float, double Note that for matrices that have zero determinant, this returns ``(0, -inf)`` + Args: x (Tensor): the batch of matrices of size :math:`(*, n, n)` where math:`*` is one or more batch dimensions. Returns: - y (Tensor): A tensor containing the sign of the determinant and the natural logarithm + y (Tensor), A tensor containing the sign of the determinant and the natural logarithm of the absolute value of determinant, respectively. Examples: @@ -2097,6 +2102,7 @@ def svd(x, full_matrices=False, name=None): def matrix_power(x, n, name=None): r""" + Computes the n-th power of a square matrix or a batch of square matrices. Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be @@ -2122,8 +2128,8 @@ def matrix_power(x, n, name=None): For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its - data type should be the same as that of `x`. + - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its + data type should be the same as that of `x`. Examples: .. code-block:: python @@ -3058,8 +3064,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): def solve(x, y, name=None): r""" + Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. - Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`Y` be a vector/matrix or a batch of vectors/matrices, the equation should be: .. math:: @@ -3068,9 +3075,9 @@ def solve(x, y, name=None): Specifically, this system of linear equations has one solution if and only if input 'X' is invertible. Args: - x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or + x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or + y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or more batch dimensions. Its data type should be float32 or float64. name(str, optional): Name for the operation (optional, default is None). 
For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index eb7fe5263b14594dc170ed03439e6335a1d7eba5..1af56ee90ea745746bf0b4f65db30328ad50c26f 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -223,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): - """ + r""" + stanh activation. .. math:: @@ -234,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): x (Tensor): The input Tensor with data type float32, float64. scale_a (float, optional): The scale factor a of the input. Default is 0.67. scale_b (float, optional): The scale factor b of the output. Default is 1.7159. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: A Tensor with the same data type and shape as ``x`` . diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 850e9988ee139c3101c6e21bb551f7360ee3e4de..9adda41eebfa92c6b53759743d2e656dfdd8831f 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1301,15 +1301,17 @@ def distribute_fpn_proposals( name=None, ): r""" - In Feature Pyramid Networks (FPN) models, it is needed to distribute + + In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN level, with respect to scale of the proposals, the referring scale and the referring level. Besides, to restore the order of proposals, we return an array which indicates the original index of rois in current proposals. To compute FPN level for each roi, the formula is given as follows: .. math:: - roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} - level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\ + level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level) + where BBoxArea is a function to compute the area of each roi. Args: @@ -1333,13 +1335,13 @@ def distribute_fpn_proposals( None by default. Returns: - multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is - and data type is same as `fpn_rois` . The length is max_level-min_level+1. - restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - , where N is the number of total rois. The data type is int32. - rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape - is [B] and data type of int32, where B is the number of images. + - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is + and data type is same as `fpn_rois` . The length is max_level-min_level+1. + - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] + , where N is the number of total rois. The data type is int32. + - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32, where B is the number of images. Examples: .. 
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 850e9988ee139c3101c6e21bb551f7360ee3e4de..9adda41eebfa92c6b53759743d2e656dfdd8831f 100755
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -1301,15 +1301,17 @@ def distribute_fpn_proposals(
     name=None,
 ):
     r"""
-    In Feature Pyramid Networks (FPN) models, it is needed to distribute
+
+    In Feature Pyramid Networks (FPN) models, it is needed to distribute
     all proposals into different FPN level, with respect to scale of the
     proposals, the referring scale and the referring level. Besides, to restore
     the order of proposals, we return an array which indicates the original
     index of rois in current proposals. To compute FPN level for each roi,
     the formula is given as follows:
 
     .. math::
-        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
-        level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
+        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
+        level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
+
     where BBoxArea is a function to compute the area of each roi.
 
     Args:
@@ -1333,13 +1335,13 @@
         None by default.
 
     Returns:
-        multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
-        and data type is same as `fpn_rois` . The length is max_level-min_level+1.
-        restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
-        , where N is the number of total rois. The data type is int32.
-        rois_num_per_level (List): A list of 1-D Tensor and each Tensor is
-        the RoIs' number in each image on the corresponding level. The shape
-        is [B] and data type of int32, where B is the number of images.
+        - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
+          the number of proposals in that level and the data type is the same as `fpn_rois`. The length is max_level-min_level+1.
+        - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1],
+          where N is the number of total rois. The data type is int32.
+        - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
+          the RoIs' number in each image on the corresponding level. The shape
+          is [B] and data type of int32, where B is the number of images.
 
     Examples:
         .. code-block:: python
@@ -1356,6 +1358,7 @@ def distribute_fpn_proposals(
                 refer_level=4,
                 refer_scale=224,
                 rois_num=rois_num)
+
     """
     num_lvl = max_level - min_level + 1
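
The rewritten Returns block above corresponds to a three-value result; a minimal sketch mirroring the docstring example (not part of the patch; shapes are arbitrary and ``rois_num`` must sum to the number of rois):

.. code-block:: python

    import paddle

    fpn_rois = paddle.rand((10, 4))
    rois_num = paddle.to_tensor([3, 1, 4, 2], dtype=paddle.int32)  # sums to 10
    multi_rois, restore_ind, rois_num_per_level = paddle.vision.ops.distribute_fpn_proposals(
        fpn_rois=fpn_rois,
        min_level=2,
        max_level=5,
        refer_level=4,
        refer_scale=224,
        rois_num=rois_num)
    # len(multi_rois) == max_level - min_level + 1 == 4
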
@@ -2441,6 +2444,7 @@ def matrix_nms(
     name=None,
 ):
     """
+
     This operator does matrix non maximum suppression (NMS).
     First selects a subset of candidate bounding boxes that have higher scores
     than score_threshold (if provided), then the top k candidate is selected if
@@ -2448,6 +2452,7 @@
     decayed according to the Matrix NMS scheme.
     Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
     per image if keep_top_k is larger than -1.
+
     Args:
         bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
             predicted locations of M bounding bboxes,
@@ -2471,29 +2476,32 @@
            on score_threshold.
         keep_top_k (int): Number of total bboxes to be kept per image after
            NMS step. -1 means keeping all bboxes after NMS step.
-        use_gaussian (bool): Use Gaussian as the decay function. Default: False
-        gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
-        background_label (int): The index of background label, the background
+        use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
+        gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
+        background_label (int, optional): The index of background label, the background
            label will be ignored. If set to -1, then all categories will be
            considered. Default: 0
-        normalized (bool): Whether detections are normalized. Default: True
-        return_index(bool): Whether return selected index. Default: False
-        return_rois_num(bool): whether return rois_num. Default: True
-        name(str): Name of the matrix nms op. Default: None.
+        normalized (bool, optional): Whether detections are normalized. Default: True
+        return_index(bool, optional): Whether to return the selected index. Default: False
+        return_rois_num(bool, optional): Whether to return rois_num. Default: True
+        name(str, optional): Name of the matrix nms op. Default: None.
     Returns:
-        A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True,
-        otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
-        Out (Tensor): A 2-D Tensor with shape [No, 6] containing the
-        detection results.
-        Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
-        Index (Tensor): A 2-D Tensor with shape [No, 1] containing the
-        selected indices, which are absolute values cross batches.
-        rois_num (Tensor): A 1-D Tensor with shape [N] containing
-        the number of detected boxes in each image.
+        - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
+          otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
+        - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
+          detection results.
+          Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
+        - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
+          selected indices, which are absolute values cross batches.
+        - rois_num (Tensor), A 1-D Tensor with shape [N] containing
+          the number of detected boxes in each image.
+
     Examples:
         .. code-block:: python
+
            import paddle
            from paddle.vision.ops import matrix_nms
+
            boxes = paddle.rand([4, 1, 4])
            boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
            boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
@@ -2501,6 +2509,7 @@ def matrix_nms(
            out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
                             score_threshold=0.5, post_threshold=0.1,
                             nms_top_k=400, keep_top_k=200, normalized=False)
+
     """
     check_variable_and_dtype(
         bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'