Unverified · Commit 44e0393c authored by Nyakku Shigure and committed by GitHub

bump black to 2023 style (#54523)

Parent e73ddd6c
......@@ -16,7 +16,6 @@
# Options affecting formatting.
# -----------------------------
with section("format"):
# How wide to allow formatted cmake files
line_width = 80
......
......@@ -53,7 +53,7 @@ repos:
)$
# For Python files
- repo: https://github.com/psf/black.git
rev: 22.8.0
rev: 23.3.0
hooks:
- id: black
files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
......
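The only tooling change above is the black pin in `.pre-commit-config.yaml`, bumped from 22.8.0 to 23.3.0; the hunks below are the mechanical result of rerunning that hook. A minimal sketch of the recurring rewrite patterns visible in this diff (hypothetical snippets, not code from the Paddle tree), assuming the hunks reflect black's 2023 stable style:

```python
# Illustrative sketch only: hypothetical names, not code from this repository.

# 1. A blank line at the top of a block body is removed.
#        def build(self):                 def build(self):
#                                  ->          ops = []
#            ops = []

# 2. Parentheses around tuple targets in `for` loops are dropped.
pairs = {"a": 1, "b": 2}
# old: for (key, value) in pairs.items():
for key, value in pairs.items():
    print(key, value)

# 3. Redundant parentheses in `with` and `except` are dropped.
# old: with (open(__file__, "rb")) as f:
with open(__file__, "rb") as f:
    f.read(1)

try:
    int("not a number")
# old: except (ValueError):
except ValueError:
    pass
```

Rerunning the hook after the rev bump (e.g. `pre-commit run black --all-files`, using the `black` hook id declared above) would regenerate this kind of diff.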
......@@ -547,7 +547,6 @@ def GenerateCoreOpInfoDeclaration():
def GenerateCoreOpInfoDefinition():
op_args_info_list = []
for op_name, arg_list in core_ops_args_info.items():
arg_str = ",".join(["\"" + v + "\"" for v in arg_list])
......@@ -803,7 +802,6 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
self.backward_returns_list = backward_returns_list_new
def CollectForwardInfoFromBackwardContents(self):
backward_forward_str = self.backward_forward_str
(
......@@ -1910,7 +1908,6 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
self.grad_api_contents["backward_op"] in prim_white_list
or is_invoke_forward_api
):
next_grad_node_creation_str = f"""
if (!paddle::prim::PrimCommonUtils::IsEagerPrimEnabled()) {{
if(trace_backward) {{
......@@ -2274,7 +2271,6 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index});
}}"""
if IsPlainTensorType(ttype):
if (
backward_inplace_map
and name in backward_inplace_map.values()
......
......@@ -604,7 +604,6 @@ def GenerateCoreOpsInfoMap():
def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str):
(
core_ops_infos_definition,
core_ops_infos_registry,
......
......@@ -500,7 +500,7 @@ def parse_get_expected_kerneltype(
fw_name = op_comp_map['op'].split('(')[0].strip()
# deal the last underline of function name in op_comp_map['get_expected_kernel_type']
new_get_expected_kernel_type_func_map = {}
for (key, value) in op_comp_map['get_expected_kernel_type'].items():
for key, value in op_comp_map['get_expected_kernel_type'].items():
new_get_expected_kernel_type_func_map[
delete_last_underline(key)
] = value
......
......@@ -615,7 +615,7 @@ def cross_validate(ops):
assert len(fw_call["inputs"]) <= len(
fw_op["inputs"]
), f"{name}: forward call has more inputs than the op "
for (input, input_) in zip(fw_call["inputs"], fw_op["inputs"]):
for input, input_ in zip(fw_call["inputs"], fw_op["inputs"]):
assert (
input["typename"] == input_["typename"]
), f"type mismatch in {name} and {fw_name}"
......@@ -623,7 +623,7 @@ def cross_validate(ops):
assert len(fw_call["attrs"]) <= len(
fw_op["attrs"]
), f"{name}: forward call has more attrs than the op "
for (attr, attr_) in zip(fw_call["attrs"], fw_op["attrs"]):
for attr, attr_ in zip(fw_call["attrs"], fw_op["attrs"]):
if attr["typename"] == "Scalar":
# special case for Scalar, fw_call can omit the type
assert re.match(
......@@ -637,7 +637,7 @@ def cross_validate(ops):
assert len(fw_call["outputs"]) == len(
fw_op["outputs"]
), f"{name}: forward call has more outputs than the op "
for (output, output_) in zip(
for output, output_ in zip(
fw_call["outputs"], fw_op["outputs"]
):
assert (
......
......@@ -316,7 +316,6 @@ def generate_backward_api(
header_file_path,
source_file_path,
):
bw_apis = []
for each_api_yaml in backward_yaml_path:
with open(each_api_yaml, 'r') as f:
......
......@@ -92,7 +92,6 @@ def generate_intermediate_api(
dygraph_header_file_path,
dygraph_source_file_path,
):
dygraph_header_file = open(dygraph_header_file_path, 'w')
dygraph_source_file = open(dygraph_source_file_path, 'w')
......
......@@ -351,7 +351,6 @@ namespace sparse {
def generate_api(api_yaml_path, header_file_path, source_file_path):
with open(api_yaml_path, 'r') as f:
apis = yaml.load(f, Loader=yaml.FullLoader)
header_file = open(header_file_path, 'w')
......
......@@ -158,7 +158,6 @@ namespace sparse {
def generate_api(api_yaml_path, header_file_path, source_file_path):
with open(api_yaml_path, 'r') as f:
apis = yaml.load(f, Loader=yaml.FullLoader)
header_file = open(header_file_path, 'w')
......
......@@ -362,7 +362,6 @@ namespace strings {
def generate_api(api_yaml_path, header_file_path, source_file_path):
with open(api_yaml_path, 'r') as f:
apis = yaml.load(f, Loader=yaml.FullLoader)
header_file = open(header_file_path, 'w')
......
......@@ -68,7 +68,6 @@ def CreateGatherGemmScatterOperator(
for tile_description in tile_descriptions:
for alignment in alignment_constraints:
for complex_transform in complex_transforms:
alignment_c = min(8, alignment)
A = TensorDescription(
......@@ -98,7 +97,6 @@ def CreateGatherGemmScatterOperator(
def GenerateSM80_TensorOp_16816(manifest, cuda_version, debug=False):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
......@@ -211,7 +209,6 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version, debug=False):
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
if math_inst.element_a != math_inst.element_accumulator:
data_type_mixed = [
math_inst.element_a,
math_inst.element_b,
......@@ -225,7 +222,6 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version, debug=False):
def GenerateSM80_TensorOp_1688(manifest, cuda_version, debug=False):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
......@@ -341,7 +337,6 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version, debug=False):
def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version, debug=False):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
......@@ -443,7 +438,6 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version, debug=False):
def GenerateSM80_TensorOp_1688_fast_fp32_math(
manifest, cuda_version, debug=False
):
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
return
......@@ -525,7 +519,6 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(
def GenerateSM75_TensorOp_1688(manifest, cuda_version, debug=False):
if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
return
......@@ -649,7 +642,6 @@ class KernelCfg:
if __name__ == "__main__":
args = KernelCfg(
architectures='80',
build_dir=sys.argv[2],
......
......@@ -156,7 +156,6 @@ launchKernel<"""
class GatherGemmScatterManifest(Manifest):
def emit(self, target=GeneratorTarget.Library):
operation_emitters = {
GeneratorTarget.Library: GatherGemmScatterEmitOperationKindLibrary
}
......
......@@ -89,7 +89,6 @@ struct ${operation_name} {
return ""
def emit(self, operation):
threadblock_shape = operation.tile_description.threadblock_shape
warp_count = operation.tile_description.warp_count
......@@ -107,7 +106,6 @@ struct ${operation_name} {
and operation.B.layout in transpose_layouts.keys()
and operation.C.layout in transpose_layouts.keys()
):
instance_layout_A = transpose_layouts[operation.A.layout]
instance_layout_B = transpose_layouts[operation.B.layout]
instance_layout_C = transpose_layouts[operation.C.layout]
......@@ -124,7 +122,6 @@ struct ${operation_name} {
# Support built-in epilogue functors or user-defined functions
if isinstance(operation.epilogue_functor, enum.Enum):
epilogue_vector_length = (
min(
operation.C.alignment * DataTypeSize[operation.C.element],
......@@ -256,7 +253,6 @@ namespace sparse {
return self
def __exit__(self, exception_type, exception_value, traceback):
# Write instance definitions in top-level namespace
for instance_definition in self.instance_definitions:
self.configuration_file.write(instance_definition)
......@@ -278,7 +274,6 @@ class GatherGemmScatterOperation(GemmOperation):
epilogue_functor=EpilogueFunctor.LinearCombination,
swizzling_functor=SwizzlingFunctor.Identity8,
):
super().__init__(
gemm_kind,
arch,
......
......@@ -458,7 +458,6 @@ class ExcelWriter:
def add_worksheet(
self, mp_tensor_info_list, sheetname, loss_scale, skip_normal_tensors
):
assert self.workbook is not None
worksheet = self.workbook.add_worksheet(sheetname)
......
......@@ -137,7 +137,6 @@ class TensorCheckerConfig:
debug_step=None,
stack_height_limit=1,
):
self.enable = enable
self.debug_mode = debug_mode
self.output_dir = output_dir
......
......@@ -98,7 +98,6 @@ class AmpScaler:
decr_every_n_nan_or_inf=1,
use_dynamic_loss_scaling=True,
):
tracer = _dygraph_tracer()
if not tracer:
raise ValueError(
......
......@@ -52,7 +52,6 @@ class CostModel:
device='gpu',
fetch_cost_list=['time'],
):
place = paddle.set_device('gpu')
x = np.random.random(size=(10, 1)).astype('float32')
exe = paddle.static.Executor(place)
......
......@@ -151,7 +151,6 @@ def reader_creator(
):
def reader():
for sentence, predicate, labels in corpus_reader():
sen_len = len(sentence)
verb_index = labels.index('B-V')
......
......@@ -42,7 +42,6 @@ CACHE_DIR = 'voc2012'
def reader_creator(filename, sub_name):
tarobject = tarfile.open(filename)
name2mem = {}
for ele in tarobject.getmembers():
......
......@@ -69,7 +69,6 @@ def parallel_manual_seed(seed):
def determinate_rng(rank, dims_mapping, process_mesh):
# TODO(JZ-LIANG) Support Mesh with any high rank
# use a string to unique integer hashing algorithm for seed computation.
# instead of using offsets to coodinate seed across devices.
......@@ -119,7 +118,6 @@ def determinate_rng(rank, dims_mapping, process_mesh):
def init_auto_parallel_rng():
if not is_enable_auto_rand_ctrl():
return
......
......@@ -319,7 +319,7 @@ class AutoAlignTool:
assert os.path.isfile(filepath)
if "vars" in filename:
assert filename.endswith("pkl")
with (open(filepath, "rb")) as f:
with open(filepath, "rb") as f:
vars_list.append(pickle.load(f))
elif "program" in filename:
assert filename.endswith("pdmodel")
......@@ -328,7 +328,7 @@ class AutoAlignTool:
program_list.append(deserialize_program(program_string))
elif "dist_attr" in filename:
assert filename.endswith("pkl")
with (open(filepath, "rb")) as f:
with open(filepath, "rb") as f:
dist_attr_list.append(pickle.load(f))
dist_attr_map = {}
......
......@@ -147,7 +147,6 @@ class Device:
class Link:
default_hop = 1
default_nic_bandwidth = 24
......
......@@ -1257,7 +1257,6 @@ class Completer:
# grad ops that have not a corresponding mapping in grad_op_id_to_op_id
else:
if grad_op.type == 'sum':
assert all(map(_is_grad_var_name, grad_op.input_arg_names))
output_name = grad_op.output_arg_names[0]
......@@ -1382,7 +1381,6 @@ class Completer:
]
for idx in range(first_backward_op_idx, len(ops)):
# complete the initial grad loss op
if idx == first_backward_op_idx:
assert ops[idx].type == "fill_constant"
......@@ -1656,7 +1654,6 @@ class Completer:
learning_rate_completed = False
for idx in range(len(ops)):
# complete the annotation of the optimizer op.
# TODO to add attribute for moment var
op = ops[idx]
......@@ -1823,7 +1820,6 @@ class Completer:
)
for input_name in op.desc.input_names():
if input_name in [
'Param',
'Grad',
......
......@@ -316,7 +316,6 @@ class CostModel:
if pred.type == CostNodeType.COMPUTATION and (
pred_id in graph[node_id][SUCC]
):
graph[pred_id][SUCC].remove(node_id)
graph[node_id][PRED].remove(pred_id)
......
......@@ -1154,7 +1154,6 @@ class DistributedOperatorContext:
return self._exceed_backward_init_op
def prepare_context(self, src_op):
self._cur_src_op = src_op
if is_loss_grad_op(src_op):
......@@ -1189,14 +1188,12 @@ class BlockState:
self.backward_to_forward_index_map = {}
def parse_forward_blocks(self, program):
while program.current_block_idx != 0:
program._rollback()
assert program.current_block_idx == 0
for idx, block in enumerate(program.blocks):
assert idx == block.idx, "index doesn't match"
assert (
block.forward_block_idx == -1
......@@ -1209,14 +1206,12 @@ class BlockState:
assert self.nblock >= 1
def parse_backward_blocks(self, program):
assert 0 in self.forward_indices, "forward block idx are{}".format(
self.forward_indices
)
self.backward_to_forward_index_map[0] = 0
for idx, block in enumerate(program.blocks):
if idx < len(self.forward_indices):
continue
......
......@@ -152,7 +152,6 @@ class DistributedSaver:
return state_dict, dist_attr
def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs):
dirname, filename = _process_path(path)
# save distributed inference program
......
......@@ -124,7 +124,6 @@ class Engine:
cluster=None,
strategy=None,
):
if (
model
and not isinstance(model, paddle.nn.Layer)
......@@ -1411,7 +1410,6 @@ class Engine:
epochs=1,
steps_per_epoch=None,
):
dist_context = self._dist_contexts[self._mode]
dist_main_prog = dist_context.dist_main_programs[self._cur_rank]
dist_startup_prog = dist_context.dist_startup_programs[self._cur_rank]
......@@ -1472,7 +1470,6 @@ class Engine:
steps_per_epoch=None,
collate_fn=None,
):
dist_context = self._dist_contexts[self._mode]
dist_main_prog = dist_context.dist_main_programs[self._cur_rank]
dist_startup_prog = dist_context.dist_startup_programs[self._cur_rank]
......
......@@ -338,7 +338,6 @@ def set_comm_op_dist_attr_for_program(
def naive_copy_op_dist_attr_for_program(new_op, ref_op, ctx):
ref_dist_attr = ctx.get_op_dist_attr_for_program(ref_op)
new_op_dist_attr = OperatorDistAttr()
new_op_dist_attr.process_mesh = ref_dist_attr.process_mesh
......
......@@ -77,7 +77,6 @@ class DistributedCheckFiniteAndUnscaleImpl(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
# by now the backward function only insert the gradient allreduce for dist op itself
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.main_block
......
......@@ -570,7 +570,6 @@ class DistributedDefaultImpl0(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
# by now the backward function only insert the gradient allreduce for dist op itself
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
......
......@@ -52,7 +52,6 @@ class DistributedDropoutImpl0(DistributedElementwiseImpl0):
@staticmethod
def forward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
......@@ -61,7 +60,6 @@ class DistributedDropoutImpl0(DistributedElementwiseImpl0):
op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
if is_enable_auto_rand_ctrl() and not op_dist_attr.is_recompute:
assert (
op_dist_attr is not None
), f"forward op [{str(src_op)}] don't have dist attribute !"
......
......@@ -70,7 +70,6 @@ register_distributed_operator_impl_container(
def adopt_lookup_table_v1(ctx, main_block, src_op, Ids_var):
assert (
len(Ids_var.shape) == 3
), "input Ids to lookup_table should have 3 dimensions but got [{}] with shape [{}]".format(
......@@ -577,7 +576,6 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
# by now the backward function only insert the gradient allreduce for dist op itself
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
......
......@@ -75,7 +75,6 @@ class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl):
return res_cost
def is_input_compatible(self, dist_op):
return True
def is_output_compatible(self, dist_op):
......
......@@ -52,7 +52,6 @@ class DistributedFlashAttnImpl0(DistributedElementwiseImpl0):
@staticmethod
def forward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
......@@ -65,7 +64,6 @@ class DistributedFlashAttnImpl0(DistributedElementwiseImpl0):
and not op_dist_attr.is_recompute
and rank_id in op_dist_attr.process_mesh.process_ids
):
assert (
op_dist_attr is not None
), f"forward op [{str(src_op)}] don't have dist attribute !"
......
......@@ -154,7 +154,6 @@ class DistributedFusedAttentionImpl(DistributedOperatorImpl):
@staticmethod
def forward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
......
......@@ -71,7 +71,6 @@ class DistributedDropoutImpl0(DistributedElementwiseImpl0):
op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
if is_enable_auto_rand_ctrl() and not op_dist_attr.is_recompute:
assert (
op_dist_attr is not None
), f"forward op [{str(src_op)}] don't have dist attribute !"
......
......@@ -146,7 +146,6 @@ class DistributedFusedFeedForwardImpl(DistributedOperatorImpl):
@staticmethod
def forward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
......@@ -188,7 +187,6 @@ class DistributedFusedFeedForwardImpl(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
......
......@@ -305,7 +305,6 @@ def _is_auto_compatible_for_matmul(dist_op):
def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
# by now the backward function only insert the gradient allreduce for dist op itself
dist_op_context = ctx.dist_op_context
......@@ -386,7 +385,6 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
break
if is_parameter_related(Y_var.name, main_block) and Y_var_partitioned:
if Y_var_dim_mapping[0] >= 0:
# row parallel: c_identity + matmul
assert Y_var_dim_mapping[1] < 0
......@@ -541,7 +539,6 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id):
if Weight_var.name in dist_op_context.already_init_sync_vars:
return
assert startup_block.has_var(Weight_var.name)
......
......@@ -157,7 +157,6 @@ class DistributedPNormImpl0(DistributedOperatorImpl):
@staticmethod
def forward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
src_op = dist_op_context.cur_src_op
......@@ -271,7 +270,6 @@ class DistributedPNormImpl0(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
backward_op = dist_op_context.cur_src_op
......
......@@ -78,7 +78,6 @@ class DistributedReduceSumPrimtiveImpl0(DistributedOperatorImpl):
@staticmethod
def forward(ctx, *args, **kwargs):
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
......
......@@ -66,7 +66,6 @@ class DistributedUpdateLossScalingImpl(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
# the backward function only filter the gradient with current rank id
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.main_block
......
......@@ -143,7 +143,6 @@ class AutoParallelizer:
no_grad_set,
callbacks,
):
with program_guard(main_program, startup_program):
params_grads = append_backward(
loss,
......@@ -158,7 +157,6 @@ class AutoParallelizer:
return params_grads
def _apply_optimize(self, main_program, startup_program, params_grads):
optimizer = copy.deepcopy(self._optimizer)
with program_guard(main_program, startup_program):
optimize_ops = optimizer.apply_gradients(params_grads)
......@@ -173,7 +171,6 @@ class AutoParallelizer:
def _apply_post_optimization_passes(
self, main_program, startup_program, rank, params_grads
):
if self._dist_strategy.sharding:
config = copy.deepcopy(self._dist_strategy.sharding_configs)
config["dist_context"] = self._dist_context
......
......@@ -112,7 +112,6 @@ class Partitioner:
def partition_startup_program(
self, serial_main_program, serial_startup_program
):
if not isinstance(serial_startup_program, (Program)):
raise TypeError(
"dist_context be paddle.framework.Program, got %s here"
......@@ -232,7 +231,6 @@ class Partitioner:
return partitioned_main_prog, partitioned_params_and_grads
def partition_block(self, ref_block, target_block):
dist_op_context = self._dist_context.dist_op_context
serial_ops = ref_block.ops
......@@ -256,7 +254,6 @@ class Partitioner:
# partition
appended_grad_times = 0
for idx, op in enumerate(serial_ops):
op_dist_attr = self._dist_context.get_op_dist_attr_for_program(op)
if is_backward_op(op) and (
is_forward_op(serial_ops[idx - 1])
......@@ -358,7 +355,6 @@ class Partitioner:
)
def _is_valid_annotated_program(self, program):
# TODO (ZJ-LIANG) should check all block
ops = program.global_block().ops
vars_ = program.list_vars()
......@@ -381,7 +377,6 @@ class Partitioner:
return all_ops_annotated and all_vars_annotated
def _get_dist_var_by_serial_var(self, serial_var, partitioned_main_prog):
block_idx = serial_var.block.idx
target_block = partitioned_main_prog.blocks[block_idx]
dist_var_name = self._serial2dist_varname_mapping[serial_var.name]
......@@ -390,7 +385,6 @@ class Partitioner:
def _get_dist_shape(var, dist_attr):
var_shape = var.shape
mapping = dist_attr.dims_mapping
mesh = dist_attr.process_mesh.shape
......
......@@ -58,7 +58,6 @@ def remove_process_group(ring_id):
def new_process_group(
ranks, group_id=None, force_new_group=False, group_type=None
):
global _g_process_group_map
if not force_new_group:
# A key constructed from ranks is used for avoiding duplication
......
......@@ -106,7 +106,6 @@ def new_algorithm(name, config):
@register_algor("sharding")
class ShardingStageAlgorithm(AlgorithmBase):
# TODO import trial class & copy strategy
def __init__(self, config):
super().__init__(config)
......@@ -131,9 +130,7 @@ class ShardingStageAlgorithm(AlgorithmBase):
self._total_num_trial = len(self._stage_range)
def next_trial(self):
if self._trial_idx < self._total_num_trial:
stage = self._stage_range[self._trial_idx]
new_strategy = copy.deepcopy(self._config.dist_strategy)
......@@ -148,7 +145,6 @@ class ShardingStageAlgorithm(AlgorithmBase):
return Trial(None, None, None, status=TrialStatus.STOPPED)
def update(self, results):
et = results.get("ErrorType", None)
if et and et == "ResourceExhaustedError":
self._trial_idx = self._total_num_trial
......@@ -211,7 +207,6 @@ class ReccomputeCheckpointAlgorithm(AlgorithmBase):
return Trial(None, None, None, status=TrialStatus.STOPPED)
def update(self, results):
et = results.get("ErrorType", None)
if self._recompute_mode == "all":
if et and et == "ResourceExhaustedError":
......
......@@ -33,7 +33,6 @@ class TuningConfig:
"""
def __init__(self, strategy):
if not isinstance(strategy, Strategy):
raise TypeError("'strategy' must be object of class `Strategy`.")
......
......@@ -111,7 +111,6 @@ def parse_results(results):
# TODO only dependent on dist context
# all env need to be start a new pass are member of dist context
def _copy_context(ref_dist_context):
# clear all process groups and recover the world process group
clear_all_process_groups()
ranks = []
......@@ -210,7 +209,6 @@ class OptimizationTuner:
batch_size,
rank,
):
self._config = TuningConfig(dist_context.strategy)
# should not modify dist context from calling function
self._baseline_dist_context = _copy_context(dist_context)
......@@ -250,7 +248,6 @@ class OptimizationTuner:
# TODO Generate compelet program with all parts like forward, backward, update
# as well as parallelism transformation.
def _build_programs_without_optimization(self):
serial_main_program = self._baseline_dist_context.serial_main_program
serial_startup_program = (
self._baseline_dist_context.serial_startup_program
......@@ -287,7 +284,6 @@ class OptimizationTuner:
)
def _select_tuning_algorithm(self):
selected_passes_set = self._config.tuning_passes_name
algorithm_name = "_".join(sorted(selected_passes_set))
self._algorithm = new_algorithm(algorithm_name, self._config)
......@@ -415,7 +411,6 @@ class OptimizationTuner:
return trial
def _get_profile_context(self, trial, result_path):
profile_ctx = {}
profile_ctx['distributed_env'] = copy.deepcopy(
......@@ -446,7 +441,6 @@ class OptimizationTuner:
return input_names
def _launch_profile(self, ctx_path, trial_dir):
if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
else:
......@@ -528,7 +522,6 @@ class OptimizationTuner:
return Error_results
def _evaluate_trial(self, trial):
self._logger.info(f"Trial {trial.name} evaluation start.")
self._apply_optimization(trial)
......
......@@ -96,7 +96,6 @@ def init_process_groups(group_map, rank):
def get_cpp_error_type(error):
msg = str(error).splitlines()
cpp_error_types = [
'InvalidArgumentError',
......@@ -123,7 +122,6 @@ def get_cpp_error_type(error):
def create_dataloader(
main_program, startup_program, profile_ctx, epochs=1, steps_per_epoch=None
):
dataset = profile_ctx["dataset"]
main_block = main_program.global_block()
feed_list = []
......@@ -274,7 +272,6 @@ def profiler(args):
data_loader._inner_dataloader.reset()
except Exception as e:
error_type = get_cpp_error_type(e)
result_dict = {
"Throughtput": -1,
......
......@@ -1822,7 +1822,6 @@ class RuleBasedTuner:
)
for input_name in op.desc.input_names():
if input_name in [
'Param',
'Grad',
......
......@@ -130,7 +130,6 @@ class OptimizationTunerTrial(Trial):
return self._name
def summary(self):
spacing = 2
max_k = 38
max_v = 38
......
......@@ -422,7 +422,6 @@ def _linear_idx2coordinate(mesh_shape, linear_idx):
def _get_corresponding_rank(dist_context, target_mesh, rank):
# TODO(JZ-LIANG) a hack method to support varying mesh in Pipeline parallelism case.
# we assume that all mesh are evenly divide from a parent mesh and should have same size.
# to revise this in future.
......@@ -1190,7 +1189,6 @@ def set_grad_var_shape(program, dist_context):
grad_var_to_var = dist_context.dist_op_context.grad_var_to_var
for idx, op in enumerate(block.ops):
if int(op.attr('op_role')) != int(OpRole.Backward):
continue
......@@ -1210,7 +1208,6 @@ def set_grad_var_shape(program, dist_context):
assert op_dist_attr is not None
for var_name in op.output_arg_names:
if "@GRAD" not in var_name:
continue
if var_name in grad_var_to_var[appended_grad_times]:
......@@ -1809,7 +1806,6 @@ def to_list(value):
def debug_program(program, path, name):
filename = os.path.join(
path, name + '_program' + ".%d" % (paddle.distributed.get_rank())
)
......@@ -1827,7 +1823,6 @@ def ring_id_to_process_group(ring_id):
def find_higher_order_backward_op(program):
higher_order_op_suffix = ['_grad_grad', 'triple_grad']
for block in program.blocks:
for op in block.ops:
......@@ -2237,7 +2232,6 @@ def insert_dependencies_for_two_ops(
)
def _select_best_depend_var(vars):
# parameter should not be dep var since it maybe partition in sharding pass
vars = [var for var in vars if not var.is_parameter]
assert len(vars) > 0
......
......@@ -58,7 +58,6 @@ def gather(
sync_op=True,
use_calc_stream=False,
):
"""
Gather tensors from all participators.
......@@ -120,7 +119,6 @@ def gather(
)
gather_list = []
else:
assert (
gather_list is not None
), "gather_list must not be none for dst rank"
......
......@@ -44,7 +44,6 @@ class Command:
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Elastic Command')
parser.add_argument(
"--elastic_server", type=str, help="etcd server host:port"
......
......@@ -83,7 +83,6 @@ class Gloo:
need_init_all=False,
kwargs=None,
):
self._rendezvous = rendezvous
self._role = role
self._role_id = role_id
......
......@@ -47,7 +47,6 @@ def enable_elastic(args, distribute_mode):
def launch_elastic(args, distribute_mode):
server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
srv, port = server.split(':')
import etcd3
......@@ -60,7 +59,6 @@ def launch_elastic(args, distribute_mode):
signal.signal(signal.SIGINT, elastic.signal_handler)
while True:
# wait for all nodes ready to run
elastic.wait()
......
......@@ -123,7 +123,6 @@ class LauncherInterface:
class ElasticManager:
def __init__(self, args, etcd_client):
self.args = args
server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER')
name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID')
......@@ -603,7 +602,6 @@ class ElasticManager:
self.launcher.launch()
def watch(self):
if self.need_sync:
self.need_sync = False
......
......@@ -494,7 +494,6 @@ def run_with_coverage(*args):
def start_local_trainers(
cluster, pod, training_script, training_script_args, log_dir=None, envs=None
):
if envs is None:
current_env = copy.copy(os.environ.copy())
else:
......
......@@ -59,7 +59,6 @@ class ASPOptimizer(MetaOptimizerBase):
def minimize_impl(
self, loss, startup_program=None, parameter_list=None, no_grad_set=None
):
optimize_ops, params_grads = ASPHelper._minimize(
self.inner_opt,
loss,
......
......@@ -51,7 +51,6 @@ class DygraphShardingOptimizer:
inner_optimizer_class,
**inner_optimizer_kargs
):
if not isinstance(params, list):
raise TypeError(
"`parameters` argument given to the DygraphShardingOptimizer should be "
......@@ -89,7 +88,6 @@ class DygraphShardingOptimizer:
p.clear_gradient()
def _build_sharding_mapping(self):
self._rank2params = self._partition_parameters()
self._param2rank = self._map_param_to_rank()
......@@ -172,7 +170,6 @@ class DygraphShardingOptimizer:
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
# NOTE in dygraph mode, the only different between step and minimize is that minimize
# allow user to customize the parameters for updating on each step
......
......@@ -51,7 +51,6 @@ class HeterParallelOptimizer:
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
# minimize does not support parameters in the form of param_group,
# so no need use _obtain_optimizer_parameters_list
parameter_list = (
......
......@@ -428,7 +428,6 @@ class HybridParallelOptimizer:
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
# minimize does not support parameters in the form of param_group,
# so no need use _obtain_optimizer_parameters_list
parameter_list = (
......
......@@ -196,7 +196,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
)
if not compiled_config.is_geo_mode():
from paddle.incubate.distributed.fleet.parameter_server.ir.public import (
_get_optimize_ops,
)
......
......@@ -47,7 +47,6 @@ class ProgramDeps:
def _build_deps(
self,
):
for var_name in self._start_vars:
self._var_to_use_op[var_name] = []
self._var_to_generate_op[var_name] = []
......
......@@ -834,7 +834,6 @@ def get_grad_device(grad_name, shard):
def get_first_check_finite_and_unscale_op_idx(block, raise_error=True):
for idx, op in enumerate(block.ops):
if op.type == "check_finite_and_unscale":
return idx
......
......@@ -848,7 +848,6 @@ class ShardingOptimizer(MetaOptimizerBase):
elif self._sharding_segment_strategy == "segment_anchors":
if int(op.attr('op_role')) == int(OpRole.Backward):
for input_name in op.desc.input_arg_names():
# NOTE (JZ-LIANG) naive rule to support amp, if amp change, should modify here accordingly
if self.user_defined_strategy.amp:
if ".cast_fp16@GRAD" not in input_name:
......@@ -1766,7 +1765,6 @@ class ShardingOptimizer(MetaOptimizerBase):
def create_persistable_gradients_and_insert_merge_ops(
self, main_block, startup_block, insert_idx, grad_names, shard
):
for grad_name in grad_names:
assert (
get_grad_device(grad_name, shard) == shard.worker_idx
......
......@@ -109,7 +109,6 @@ class SegmentLayers:
), "layer number should be greater than number of segments"
def do_segment(self):
if isinstance(self.method, list):
seg_method = self.method[:]
source_num_parts = len(seg_method) - 1
......
......@@ -357,7 +357,6 @@ def _p2p_helper(
tensor_recv_prev.append(tmp)
tensor_recv_prev = tuple(tensor_recv_prev)
else:
tensor_recv_prev = paddle.empty(
shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg)
)
......
......@@ -245,7 +245,6 @@ class FusedCommBuffer:
def assign_group_by_size(parameters, group_size=128 * 1024 * 1024):
group_idx = 0
memory_counter = 0
var_groups = OrderedDict()
......
......@@ -79,7 +79,6 @@ class GroupShardedOptimizerStage2(Optimizer):
dp_group=None,
**kw
):
super().__init__(learning_rate=optim._learning_rate, parameters=params)
assert (
core.is_compiled_with_cuda()
......@@ -418,7 +417,6 @@ class GroupShardedOptimizerStage2(Optimizer):
for dst_rank, params in enumerate(per_rank_params):
if len(params) > 0:
# Merge all the trainable params in a single InternalStorage
trainable_params = list(
filter(lambda x: x.trainable, params)
......
......@@ -780,7 +780,6 @@ def ForwardPreHooks(
offload,
task_flow,
):
# Record layer's id
layer_id = id(layer)
use_calc, sync_wait = False, False
......@@ -837,7 +836,6 @@ class ForwardPostHooks(PyLayer):
offload,
task_flow,
):
layer_id = id(layer)
# release current layer full params
_release_param(
......@@ -970,7 +968,6 @@ def _wait_layer(
use_calc_stream,
offload=False,
):
for param in trainable_params:
if param.status == "all":
param.use_count += 1
......@@ -1007,7 +1004,6 @@ def _allgather_buffer(
offload=False,
convert2cpu=False,
):
for param in trainable_params:
if param.status == "all":
param.use_count += 1
......
......@@ -157,7 +157,6 @@ class ParamStorage(InternalStorage):
@paddle.autograd.no_grad()
def _add_param_as_view(self, param, align, convert_gpu=True):
assert (
param.dtype == self.buffer.dtype
), "Different types for the InternalStorage and the param, cannot proceed: {} - {}".format(
......@@ -192,7 +191,6 @@ class ParamStorage(InternalStorage):
@paddle.autograd.no_grad()
def _convert_buffer(self, param, p_shape, align):
var_end = self._fill + np.prod(p_shape).tolist()
offset = var_end + align
assert offset <= self.buffer._numel()
......
......@@ -238,7 +238,6 @@ def GroupShardedScaler(scaler):
if getattr(optimizer._optim, '_param_groups', None) and isinstance(
optimizer._optim._param_groups[0], dict
):
for group in optimizer._optim._param_groups:
for param in group['params']:
if param.grad is not None:
......
......@@ -134,7 +134,6 @@ def distributed_model(model):
if fleet_env._hcg.get_parallel_mode() == ParallelMode.SHARDING_PARALLEL:
model = ShardingParallel(model, fleet_env._hcg, strategy=strategy)
elif fleet_env._hcg.get_parallel_mode() == ParallelMode.DATA_PARALLEL:
# NOTE (JZ-LIANG) init parameters broadcast within sharding group
# normally it should be done inside DataParallel
if fleet_env.sharding_degree > 1:
......
......@@ -29,7 +29,6 @@ __all__ = []
def _split_activation(tensor, mp_group):
mp_degree = mp_group.nranks
mp_rank = mp_group.rank
if mp_degree < 2:
......@@ -87,7 +86,6 @@ class _HPRecomputeFunction(PyLayer):
*args,
**kwargs,
):
# store for recomputing
ctx.run_function = run_function
......
......@@ -354,7 +354,7 @@ class CommonAccessor:
attr_varnames = self.opt_attr_map[oop.type]
self.accessor_class = oop.type
for (formal_name, shape) in param_varnames:
for formal_name, shape in param_varnames:
params.append(formal_name)
if self.accessor_class == "adam_d2sum":
# for dims
......@@ -424,7 +424,7 @@ class CommonAccessor:
)
initializers.append(initializer)
for (attr_varname, type_) in attr_varnames:
for attr_varname, type_ in attr_varnames:
value = oop.attr(attr_varname)
attrs.append("&".join([attr_varname, type_, str(value)]))
......@@ -1307,7 +1307,6 @@ class TheOnePSRuntime(RuntimeBase):
def _save_distributed_persistables(
self, executor, dirname, main_program, mode=0
):
denses = self.compiled_strategy.get_the_one_recv_context(
is_dense=True,
split_dense_table=self.role_maker._is_heter_parameter_server_mode,
......
......@@ -1223,7 +1223,6 @@ class AFSClient(FS):
return self._ls_dir(fs_path)
def _ls_dir(self, fs_path):
files = self._fs.list(fs_path)
dirs = [fs_path]
return dirs, files
......
......@@ -195,7 +195,6 @@ class HybridParallelInferenceHelper:
init_comm=True,
role_maker=None,
):
assert isinstance(startup_program, Program)
assert isinstance(main_program, Program)
......
......@@ -235,7 +235,6 @@ def sharding_reduce_gradients(parameter_list, hcg):
# TODO merge grad / nrank with dp
logger.debug("sharding start gradients sync")
with framework.no_grad():
sharding_nrank = hcg.get_sharding_parallel_group().nranks
for param in parameter_list:
g_var = None
......
......@@ -77,7 +77,6 @@ class MixPrecisionLayer(nn.Layer):
include_sublayers=True,
structured_name_prefix="",
):
return self._layers.state_dict(
destination=destination,
include_sublayers=include_sublayers,
......@@ -86,7 +85,6 @@ class MixPrecisionLayer(nn.Layer):
@framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True):
self._layers.set_state_dict(
state_dict, use_structured_name=use_structured_name
)
......@@ -113,7 +111,6 @@ class MixPrecisionOptimizer:
@imperative_base.no_grad
@framework.dygraph_only
def step(self):
if not isinstance(self._parameter_list[0], dict):
params_grads = []
for param in self._parameter_list:
......@@ -179,7 +176,6 @@ class MixPrecisionOptimizer:
@framework.dygraph_only
def clear_grad(self, set_to_zero=True):
param_list = []
if self._parameter_list is None or not isinstance(
self._parameter_list[0], dict
......
......@@ -65,7 +65,7 @@ class LayerReNamingManager:
def get_new_layer_name(self, old_name: str):
layer_name = ""
for (k, v) in self._renaming_helpers.items():
for k, v in self._renaming_helpers.items():
if old_name.startswith(k):
layer_name = v.get_new_layer_name(old_name)
break
......@@ -128,12 +128,12 @@ class PipeLineModelAdaptor:
]
# 4、merge layers belonging to the same node
for (layer_segment, dir_) in zip(layer_segments, dst_dirs):
for layer_segment, dir_ in zip(layer_segments, dst_dirs):
print(f"merge {len(layer_segment)} layers to {dir_}")
self.merge_layers(layer_segment, dir_)
# 5、copy meta_state.pdopt
for (src_dir, dst_dir) in zip(src_dirs, dst_dirs):
for src_dir, dst_dir in zip(src_dirs, dst_dirs):
shutil.copyfile(
f"{src_dir}/meta_state.pdopt",
f"{dst_dir}/meta_state.pdopt",
......@@ -155,7 +155,7 @@ class PipeLineModelAdaptor:
def peek_partial_model(self, sub_dir: str):
state_dict = paddle.load(f"{sub_dir}/model.pdparams")
for (k, v) in state_dict.items():
for k, v in state_dict.items():
print(f"\t{k} -> {v.name}")
def extract_layers(self, dir: str, with_shared: bool):
......@@ -164,7 +164,7 @@ class PipeLineModelAdaptor:
shared_layer_parsed = False
# tname -> (layer, param_name)
tname_to_layer_and_pname = {}
for (k, v) in params.items():
for k, v in params.items():
layer = self._extract_layer_name(k)
assert layer
# special treatment for embedding layer, skip duplicated shared layer
......@@ -192,7 +192,7 @@ class PipeLineModelAdaptor:
opt_to_t = self._opt_name_to_tname(tensor_names, opt_names)
# gather tensors belonging to one layer togather
layers = OrderedDict()
for (k, v) in params.items():
for k, v in params.items():
layer, p = tname_to_layer_and_pname[v.name]
if layer not in layers:
layers[layer] = {}
......@@ -201,14 +201,14 @@ class PipeLineModelAdaptor:
layers[layer]["master_weights"] = OrderedDict()
layers[layer]["params"][p] = v
for (k, v) in opt.items():
for k, v in opt.items():
if k in ["master_weights", "LR_Scheduler"]:
continue
layer, _ = tname_to_layer_and_pname[opt_to_t[v.name]]
layers[layer]["opt"][k] = v
if "master_weights" in opt:
for (k, v) in opt["master_weights"].items():
for k, v in opt["master_weights"].items():
layer, _ = tname_to_layer_and_pname[k]
layers[layer]["master_weights"][k] = v
......@@ -218,7 +218,7 @@ class PipeLineModelAdaptor:
ans = []
for (layer_name, layer) in layers.items():
for layer_name, layer in layers.items():
# special treatment for embedding layer
if (not with_shared) and "shared_layers" in layer_name:
continue
......@@ -311,7 +311,7 @@ class PipeLineModelAdaptor:
# name layers
segments = [[] for i in range(config.pp)]
for i in range(config.pp):
for (start, end) in index_segments[i]:
for start, end in index_segments[i]:
for j in range(start, end):
if config.vpp > 1:
segments[i].append(
......@@ -338,7 +338,7 @@ class PipeLineModelAdaptor:
for i in range(1, config.pp):
segments[i] = [([layers[0][0]], layers[0][1])] + segments[i]
for (pp_rank, segs) in enumerate(segments):
for pp_rank, segs in enumerate(segments):
print(f"segmentment result for pp_rank {pp_rank}:")
print(50 * "=")
for seg in segs:
......@@ -352,12 +352,12 @@ class PipeLineModelAdaptor:
renaming_manager = LayerReNamingManager()
def merge(src, dst, map_k=None):
for (k, v) in src.items():
for k, v in src.items():
k = map_k(k) if map_k is not None else k
dst[k] = v
lr_scheduler = None
for (layer_names, file_path) in layers_segment:
for layer_names, file_path in layers_segment:
print("load %s" % file_path)
layer = paddle.load(file_path)
......@@ -425,14 +425,14 @@ class PipeLineModelAdaptor:
# old name to new name
t_name_mapping = {}
# map tensor names
for (k, v) in params.items():
for k, v in params.items():
t_name_mapping[v.name] = renaming_manager.get_new_param_name(v.name)
v.name = t_name_mapping[v.name]
# map opt names
opt_to_tname = self._opt_name_to_tname(
t_name_mapping.keys(), opt.keys()
)
for (k, v) in opt.items():
for k, v in opt.items():
old_t_name = opt_to_tname[k]
t_name = t_name_mapping[old_t_name]
opt_name = t_name + k[len(old_t_name) :]
......@@ -440,7 +440,7 @@ class PipeLineModelAdaptor:
opt_renamed[opt_name] = v
# map master names
for (k, v) in master_weights.items():
for k, v in master_weights.items():
t_name = t_name_mapping[k]
v.name = t_name + v.name[len(k) :]
master_weights_renamed[t_name] = v
......@@ -448,7 +448,6 @@ class PipeLineModelAdaptor:
def parse_args():
parser = argparse.ArgumentParser(
prog='model converter', description='converter a model'
)
......@@ -591,7 +590,6 @@ def adaptor_from_args(args):
def main():
args = parse_args()
adaptor = adaptor_from_args(args)
if args.method == "peek_model":
......
......@@ -120,7 +120,6 @@ def copy_parameters(block_, params):
def insert_sync_op(
block, idx, tp_degree, sync_mode, sync_ring_id, src_rank, varname, op_role
):
if sync_mode == "broadcast":
block._insert_op_without_sync(
idx,
......@@ -171,11 +170,9 @@ def insert_synchronization(
sync_mode,
src_rank,
):
unsync_param_names = [p.name for p in params_to_sync]
for idx, op in reversed(list(enumerate(block.ops))):
if op.type in _supported_optimizer_type:
assert "Param" in op.input_names
assert len(op.input("Param")) == 1
......@@ -183,7 +180,6 @@ def insert_synchronization(
op_role = op.attr(OP_ROLE_KEY)
if param_name in unsync_param_names:
unsync_param_names.remove(param_name)
# Param sync after opt
......
......@@ -262,13 +262,11 @@ class CollectiveElasticController(CollectiveController):
self.master.register_heartbeat(self.job.id, self.pod.name)
def run(self):
timeout = int(self.ctx.args.elastic_timeout)
timeout = timeout if self.job.elastic else timeout * 10
self.register()
while self.pod.restart <= self.ctx.args.max_restart:
self.build_job()
self.ctx.logger.info("Waiting peer ready...")
......
......@@ -54,7 +54,6 @@ class ControllerBase:
self.join_server = None
def deploy_pod(self):
assert (
len(self.pod.containers) + len(self.pod.init_containers) > 0
), "No container in the pod"
......@@ -219,7 +218,6 @@ class Controller(ControllerBase):
log_file=None,
is_init=False,
):
if not container:
container = self.new_container(
entrypoint=entrypoint, envs=envs, out=log_file, err=log_file
......
......@@ -136,7 +136,6 @@ class HTTPMaster(Master):
self._stop_server()
def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
if size < 2:
return [value], 0
......
......@@ -115,7 +115,6 @@ class PSController(Controller):
self.add_container(envs=e, log_file=log_file)
def _build_pod_with_master(self):
self.pod.rank = int(self.ctx.args.rank)
server_num = self.ctx.args.server_num or 1
......
......@@ -287,14 +287,12 @@ def launch():
ctx = Context()
if ctx.is_legacy_mode():
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
from . import controllers
# initialize the selected controller
......
......@@ -870,7 +870,6 @@ def _is_cpuonly(backend):
backend in ['auto', 'nccl', 'bkcl', 'heter']
and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
) or backend == 'xccl':
# passes 'auto' and can use cuda or xpu, use the default logics. so return False
return False
else:
......
......@@ -231,7 +231,6 @@ class AMPState:
return is_train
def _mark_black_white_ops(self, op, ops, block):
# ernie inference trick
if op.type == "assign" and "array_" in op.input_arg_names[0]:
self._op_fp16_dict[op.desc.original_id()] = False
......@@ -814,7 +813,6 @@ class AMPPass(PassBase):
main_block._sync_with_cpp()
def _check_and_update_gradient(self):
main_block = paddle.static.default_main_program().global_block()
main_block._sync_with_cpp()
......@@ -916,7 +914,6 @@ class AMPPass(PassBase):
)
def _cast_loss(self):
main_block = paddle.static.default_main_program().global_block()
main_block._sync_with_cpp()
......@@ -928,7 +925,6 @@ class AMPPass(PassBase):
)
if loss.dtype != core.VarDesc.VarType.FP32:
tmp_name = unique_name.generate(loss.name + ".cast_fp32")
cast_loss = main_block.create_var(
name=tmp_name, dtype=core.VarDesc.VarType.FP32
......@@ -1010,7 +1006,6 @@ class AMPPass(PassBase):
main_block._sync_with_cpp()
def _scale_loss(self):
main_block = paddle.static.default_main_program().global_block()
loss = self.get_attr("loss")
assert loss is not None
......@@ -1023,7 +1018,6 @@ class AMPPass(PassBase):
self.get_attr("use_dynamic_loss_scaling")
or self.get_attr("init_loss_scaling") != 1.0
):
loss_op_idx = find_op_index(main_block.desc, loss_op.desc)
# forward
......@@ -1123,7 +1117,6 @@ class AMPPass(PassBase):
main_block._sync_with_cpp()
def _update_loss_scaling(self, grads, found_inf):
main_block = paddle.static.default_main_program().global_block()
main_block._sync_with_cpp()
......
......@@ -91,7 +91,6 @@ class DataParallelOptimizationPass(PassBase):
return PassType.COMM_OPT
def _apply_single_impl(self, main_program, startup_program, context):
self.dist_context = self.get_attr("dist_context")
self.global_rank = int(self.get_attr("global_rank"))
self.use_sharding = self.get_attr("use_sharding")
......@@ -110,7 +109,6 @@ class DataParallelOptimizationPass(PassBase):
self.summary(grad_group)
def _prune_grad_scaling(self):
if not self._could_be_prune():
return
......@@ -128,7 +126,6 @@ class DataParallelOptimizationPass(PassBase):
self._calc_wait_comms()
def _fuse_allreduce(self):
if not self._could_be_fuse():
return []
......@@ -149,7 +146,6 @@ class DataParallelOptimizationPass(PassBase):
scaled_grads = []
for op in ops:
if is_data_parallel_reduce_op(op):
grad_name = op.output_arg_names[0]
if grad_name in self._grad_name_to_group_map:
......@@ -198,7 +194,6 @@ class DataParallelOptimizationPass(PassBase):
return len(self._group_to_grad_name_map) > 0
def _could_be_prune(self):
return self.dist_context.gradient_scale and (
self._support_rescale_grad or self._all_dp_groups_same_degree()
)
......@@ -215,7 +210,6 @@ class DataParallelOptimizationPass(PassBase):
)
def _scale_backward_initial_grad(self):
block = default_main_program().global_block()
dp_degree = len(list(self._group_to_grad_name_map.keys())[0].ranks)
......@@ -241,7 +235,6 @@ class DataParallelOptimizationPass(PassBase):
block._sync_with_cpp()
def _update_opt_rescale_grad(self):
block = default_main_program().global_block()
scaled_grads = set()
......@@ -313,7 +306,6 @@ class DataParallelOptimizationPass(PassBase):
block._sync_with_cpp()
def _calc_wait_comms(self):
return
block = default_main_program().global_block()
......@@ -365,7 +357,6 @@ class DataParallelOptimizationPass(PassBase):
# here we try to wait for all kernel in that comm stream to be finish which is not that optimized.
for i in sorted(indices, reverse=True):
for ring_id in op_idx_to_sync_ring_id_map[i]:
block._insert_op_without_sync(
i,
type='c_wait_comm',
......@@ -451,13 +442,11 @@ class DataParallelOptimizationPass(PassBase):
return grad_groups
def _update_program(self, grad_groups):
block = default_main_program().global_block()
remove_op_types = ['scale', 'c_allreduce_sum', 'c_wait_compute']
for i, group in enumerate(grad_groups[::-1]):
# skip unfused big tensor
if len(group.gradients) <= 1:
group.coalesce_var = group.gradients[0]
......
......@@ -194,12 +194,10 @@ class FP16State:
return self.is_train
def _mark_op(self, op):
if op.type in __amp_skip_ops__:
return
if is_forward_op(op):
# ernie inference trick
if op.type == "assign" and "array_" in op.input_arg_names[0]:
self._op_fp16_dict[op.desc.original_id()] = False
......@@ -227,7 +225,6 @@ class FP16State:
self.forward_non_leaf_tensors[var_name] = op.desc.id()
elif is_backward_op(op) == int(OpRole.Backward):
if op.desc.original_id() in self.grad_op_to_op_map:
fwd_op_id = self.grad_op_to_op_map[op.desc.original_id()]
assert fwd_op_id in self._op_fp16_dict, f"{str(op)}"
......@@ -259,7 +256,6 @@ class FP16State:
var.desc.set_dtype(__target_dtype__)
def resolute_tensor_dtype(self, block):
for op in block.ops:
if is_forward_op(op):
# NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python
......@@ -382,7 +378,6 @@ class FP16State:
def _insert_forward_cast_ops(
self, op, idx, block, src_dtype, dst_dtype, dist_context
):
num_cast_ops = 0
for in_name in op.input_names:
......@@ -470,7 +465,6 @@ class FP16State:
def _insert_backward_cast_ops(
self, op, idx, block, src_dtype, dst_dtype, dist_context
):
num_cast_ops = 0
op_id = op.desc.id()
original_id = op.desc.original_id()
......@@ -495,11 +489,9 @@ class FP16State:
src_dtype,
slot_name,
) in self.forward_input_cast_ops[forward_op_id]:
# rename input
# some forward output is not need by backward computation, e.g. logit in softmax_with_cross_entropy
if slot_name in op.input_names:
assert src_name in op.input(
slot_name
), "var: {} not in op's {}. {}".format(
......@@ -567,7 +559,6 @@ class FP16State:
def _check_and_update_gradient(grads, loss_scaling, name, dist_context):
main_block = paddle.static.default_main_program().global_block()
main_block._sync_with_cpp()
......
......@@ -327,7 +327,6 @@ class ClipGradByGloblNormPass(PassBase):
self._remove_no_need_ops_vars(block)
def _remove_no_need_ops_vars(self, block):
removed_op_out_type = [
'squared_l2_norm',
'square',
......@@ -463,7 +462,6 @@ class ClipGradByGloblNormPass(PassBase):
self.clip_helper._init_dist_attr(allreduce_op)
if insert_leaf_fill_constant_node:
# NOTE add naive deps for global norm sync in graph exe
j = idx - 1
prior_op = None
......
......@@ -92,7 +92,6 @@ class PipelinePass(PassBase):
)
def _insert_sync_ops_for_stream(self):
for block in self._program.blocks:
offset = 0
send_vars = []
......@@ -243,7 +242,6 @@ class PipelinePass(PassBase):
# set upstream/downstream for task_nodes of cur_rank
for i, (task_role, task_node) in enumerate(task_nodes.items()):
cur_id = int(self._cur_rank * num_of_functionality + i)
ups = []
downs = []
......
......@@ -65,7 +65,6 @@ class QuantizationPass(PassBase):
return True
def _apply_single_impl(self, main_program, startup_program, context):
dist_context = self.get_attr("dist_context")
params_grads = self.get_attr("params_grads")
mode = self.get_attr("mode")
......
......@@ -225,7 +225,6 @@ class ShardingPass(PassBase):
)
def _build_sharding_infos(self, main_block, params_grads):
# order params
params_grads = re_order_program(
main_block, params_grads, self._dist_context
......@@ -233,7 +232,6 @@ class ShardingPass(PassBase):
# partition
for dp_group in self.dp_groups:
assert (
dp_group.nranks >= self.sharding_world_size
), "sharding world size [{}] should not larger than dp world size [{}]".format(
......@@ -297,7 +295,6 @@ class ShardingPass(PassBase):
self._insert_optimizer_broadcasts(main_block, startup_block)
def _shard_amp_related_op_and_vars(self, main_block):
if self.stage < 2:
return
......@@ -347,7 +344,6 @@ class ShardingPass(PassBase):
main_block._sync_with_cpp()
def _shard_gradient_clip(self, main_block):
if self.stage < 2:
return
......@@ -416,7 +412,6 @@ class ShardingPass(PassBase):
main_block._sync_with_cpp()
def _shard_weight_decay(self, main_block):
if self.stage < 2:
return
......@@ -430,7 +425,6 @@ class ShardingPass(PassBase):
main_block._sync_with_cpp()
def _shard_optimizer_ops_and_states(self, main_block, startup_block):
should_removed_optimizer_states = []
for idx, op in reversed(list(enumerate(main_block.ops))):
if not is_optimize_op(op):
......@@ -471,7 +465,6 @@ class ShardingPass(PassBase):
startup_block._sync_with_cpp()
def _insert_optimizer_broadcasts(self, main_block, startup_block):
if self.stage > 2 or self.param_bucket_size_numel > 1:
return
......@@ -519,7 +512,6 @@ class ShardingPass(PassBase):
return p_g
def _shard_gradient_synchronization(self, main_block):
if self.stage < 2:
return
......@@ -562,7 +554,6 @@ class ShardingPass(PassBase):
main_block._sync_with_cpp()
def _shard_parameter(self, main_block, startup_block):
if self.stage < 3:
return
......@@ -684,7 +675,6 @@ class ShardingPass(PassBase):
startup_block._sync_with_cpp()
def _optimization_pass(self, main_program, startup_program):
if self.stage <= 1:
return
......@@ -712,7 +702,6 @@ class ShardingPass(PassBase):
self._fuse_overlap_parameter_comm_stage_three(sharding_info)
def _gradient_sync_optimization(self, sharding_info):
if self.grad_bucket_size_numel <= 1 and (not self.enable_overlap):
return
......@@ -730,7 +719,6 @@ class ShardingPass(PassBase):
)
def _fuse_overlap_parameter_comm_stage_two(self, sharding_info):
main_block = default_main_program().global_block()
startup_block = default_startup_program().global_block()
......@@ -777,7 +765,6 @@ class ShardingPass(PassBase):
self.op_to_stream_idx = {}
for i, param_group in enumerate(group_to_param_map.keys()):
assert len(param_group) >= 1
if len(param_group) > 1:
coalesce_var_name = unique_name.generate(
......@@ -1087,7 +1074,6 @@ class ShardingPass(PassBase):
# update block
for idx, op in reversed(list(enumerate(block.ops))):
if idx in modify_reduce_op_map:
group = modify_reduce_op_map[idx]
grad_name = op.output_arg_names[0]
......@@ -1202,7 +1188,6 @@ class ShardingPass(PassBase):
grad_comm_op_to_stream_idx = {}
for idx, op in enumerate(ops):
if is_data_parallel_reduce_op(op):
if op.type == "c_allreduce_sum":
continue
stream_idx = reduce_op_count % self.grad_comm_stream_num
......@@ -1429,7 +1414,6 @@ def _insert_init_and_broadcast_op(
dist_context,
)
if local_rank != root_rank:
new_op = block._insert_op_without_sync(
insert_idx,
type="empty",
......@@ -1523,7 +1507,6 @@ def _is_param_grad_fp32_cast_op(block, op):
def _is_param_fp16_cast_op(block, op, params):
if is_optimize_op(op):
return False
if not _is_desired_cast_op(block, op):
......@@ -1563,7 +1546,6 @@ def _get_base_name_from_grad_name(grad_name):
def _is_param_grad_allreduce_op(op, block):
if not is_data_parallel_reduce_op(op):
return False
......@@ -1577,7 +1559,6 @@ def _is_param_grad_allreduce_op(op, block):
def _is_param_grad_sum_op(op, block):
if not is_backward_op(op):
return False
if op.type != "sum":
......@@ -1601,7 +1582,6 @@ def is_sharding_param_broadcast_op(op):
def _inference_data_parallel_group_for_operator(rank_id, op, dist_context):
dp_group = None
for input_name in op.input_arg_names:
# TODO(zhaoyingli): maintain a dict in dist_context to record all variables which are renamed,
......@@ -1696,7 +1676,6 @@ def partition_parameters(params, group_size, algor="greedy_even"):
def re_order_program(block, param_grads, dist_context):
# record order
pname_to_pg_pairs = {}
for p, g in param_grads:
......
......@@ -67,7 +67,6 @@ class AutoParalSupplementDepPass(PassBase):
return True
def _apply_single_impl(self, main_program, startup_program, context):
# TODO general this pass for all case.
if not _sharding_pass_applied(context):
return
......
......@@ -175,7 +175,6 @@ class BuildCINNPass(CPPPassWrapper):
return PassType.CALC_OPT
def _apply_single_impl(self, main_program, startup_program, context):
assert (
'FLAGS_allow_cinn_ops' in core.globals()
), "PaddlePaddle is not compiled with CINN support"
......@@ -201,7 +200,6 @@ class BuildCINNPass(CPPPassWrapper):
)
else:
tmp_main_program = Executor._add_fetch_ops(
main_program, fetch_list, 'fetch'
)
......
......@@ -466,7 +466,7 @@ class DistributedOpsPass(PassBase):
PSGPU = core.PSGPU()
try:
gpu_slot = [int(var.name) for var in gpups_inputs]
except (ValueError):
except ValueError:
raise ValueError(
"The slot name in gpups Should be able to convert to integer."
)
......@@ -922,7 +922,6 @@ class SplitHeterWorkerOpsPass(PassBase):
first_op_index_fp = len(heter_block.ops)
if stage_id < len(program_block_ops_list):
heter_block_bp = heter_program._create_block(pre_block_idx)
optimizer_block.append(heter_block_bp)
......
......@@ -470,7 +470,7 @@ class CommonAccessor(Accessor):
attr_varnames = self.opt_attr_map[oop.type]
self.accessor_class = oop.type
for (formal_name, shape) in param_varnames:
for formal_name, shape in param_varnames:
params.append(formal_name)
if self.accessor_class == "adam_d2sum":
# for dims
......@@ -573,7 +573,7 @@ class CommonAccessor(Accessor):
oop = op
break
for (attr_varname, type_) in attr_varnames:
for attr_varname, type_ in attr_varnames:
value = oop.attr(attr_varname)
attrs.append("&".join([attr_varname, str(value)]))
......
......@@ -273,7 +273,6 @@ class GpuPsProgramBuilder(PsProgramBuilder):
super().__init__(pass_ctx)
def _build_trainer_programs(self):
add_lr_decay_table_pass = new_pass(
"add_lr_decay_table_pass", self.attrs
)
......
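The collapsed hunks below follow the same patterns. As a sanity check, one could confirm that such rewrites are formatting-only by comparing the ASTs of an old and a new snippet; a small sketch with made-up snippets, not taken from the files in this diff:

```python
import ast

# Hypothetical before/after pair in the style of the for-loop hunks above.
old = """
for (key, value) in {"a": 1}.items():
    print(key, value)
"""
new = """
for key, value in {"a": 1}.items():
    print(key, value)
"""

# ast.dump ignores parentheses and whitespace, so identical dumps mean
# the rewrite cannot change runtime behavior.
assert ast.dump(ast.parse(old)) == ast.dump(ast.parse(new))
print("ASTs match: formatting-only change")
```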
(The diffs for the remaining changed files are collapsed.)